git.chise.org Git - chise/xemacs-chise.git-/blob - src/text-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28
  29 #include "buffer.h"
  30 #include "elhash.h"
  31 #include "insdel.h"
  32 #include "lstream.h"
  33 #ifdef MULE
  34 #include "mule-ccl.h"
  35 #include "chartab.h"
  36 #endif
  37 #include "file-coding.h"
  38
  39 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
  40
  41 Lisp_Object Vkeyboard_coding_system;
  42 Lisp_Object Vterminal_coding_system;
  43 Lisp_Object Vcoding_system_for_read;
  44 Lisp_Object Vcoding_system_for_write;
  45 Lisp_Object Vfile_name_coding_system;
  46
  47 /* Table of symbols identifying each coding category. */
  48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  49
  50 /* Coding system currently associated with each coding category. */
  51 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  52
  53 /* Table of all coding categories in decreasing order of priority.
  54    This describes a permutation of the possible coding categories. */
  55 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  56
  57 Lisp_Object Qcoding_system_p;
  58
  59 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  60 /* Qinternal in general.c */
  61
  62 Lisp_Object Qmnemonic, Qeol_type;
  63 Lisp_Object Qcr, Qcrlf, Qlf;
  64 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  65 Lisp_Object Qpost_read_conversion;
  66 Lisp_Object Qpre_write_conversion;
  67
  68 #ifdef MULE
  69 Lisp_Object Qucs4, Qutf8;
  70 Lisp_Object Qbig5, Qshift_jis;
  71 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  72 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  73 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  74 Lisp_Object Qno_iso6429;
  75 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
  76 Lisp_Object Qctext, Qescape_quoted;
  77 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
  78 #endif
  79 Lisp_Object Qencode, Qdecode;
  80
  81 Lisp_Object Vcoding_system_hash_table;
  82
  83 int enable_multibyte_characters;
  84
  85 #ifdef MULE
  86 /* Additional information used by the ISO2022 decoder and detector. */
  87 struct iso2022_decoder
  88 {
  89   /* CHARSET holds the character sets currently assigned to the G0
  90      through G3 variables.  It is initialized from the array
  91      INITIAL_CHARSET in CODESYS. */
  92   Lisp_Object charset[4];
  93
  94   /* Which registers are currently invoked into the left (GL) and
  95      right (GR) halves of the 8-bit encoding space? */
  96   int register_left, register_right;
  97
  98   /* ISO_ESC holds a value indicating part of an escape sequence
  99      that has already been seen. */
 100   enum iso_esc_flag esc;
 101
 102   /* This records the bytes we've seen so far in an escape sequence,
 103      in case the sequence is invalid (we spit out the bytes unchanged). */
 104   unsigned char esc_bytes[8];
 105
 106   /* Index for next byte to store in ISO escape sequence. */
 107   int esc_bytes_index;
 108
 109 #ifdef ENABLE_COMPOSITE_CHARS
 110   /* Stuff seen so far when composing a string. */
 111   unsigned_char_dynarr *composite_chars;
 112 #endif
 113
 114   /* If we saw an invalid designation sequence for a particular
 115      register, we flag it here and switch to ASCII.  The next time we
 116      see a valid designation for this register, we turn off the flag
 117      and do the designation normally, but pretend the sequence was
 118      invalid.  The effect of all this is that (most of the time) the
 119      escape sequences for both the switch to the unknown charset, and
 120      the switch back to the known charset, get inserted literally into
 121      the buffer and saved out as such.  The hope is that we can
 122      preserve the escape sequences so that the resulting written out
 123      file makes sense.  If we don't do any of this, the designation
 124      to the invalid charset will be preserved but that switch back
 125      to the known charset will probably get eaten because it was
 126      the same charset that was already present in the register. */
 127   unsigned char invalid_designated[4];
 128
 129   /* We try to do similar things as above for direction-switching
 130      sequences.  If we encountered a direction switch while an
 131      invalid designation was present, or an invalid designation
 132      just after a direction switch (i.e. no valid designation
 133      encountered yet), we insert the direction-switch escape
 134      sequence literally into the output stream, and later on
 135      insert the corresponding direction-restoring escape sequence
 136      literally also. */
 137   unsigned int switched_dir_and_no_valid_charset_yet :1;
 138   unsigned int invalid_switch_dir :1;
 139
 140   /* Tells the decoder to output the escape sequence literally
 141      even though it was valid.  Used in the games we play to
 142      avoid lossage when we encounter invalid designations. */
 143   unsigned int output_literally :1;
 144   /* We encountered a direction switch followed by an invalid
 145      designation.  We didn't output the direction switch
 146      literally because we didn't know about the invalid designation;
 147      but we have to do so now. */
 148   unsigned int output_direction_sequence :1;
 149 };
 150 #endif /* MULE */
 151 EXFUN (Fcopy_coding_system, 2);
 152 #ifdef MULE
 153 struct detection_state;
 154
 155 static void
 156 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
 157                      unsigned_char_dynarr *dst, unsigned int n);
 158
 159 static int detect_coding_sjis (struct detection_state *st,
 160                                CONST unsigned char *src,
 161                                unsigned int n);
 162 static void decode_coding_sjis (Lstream *decoding,
 163                                 CONST unsigned char *src,
 164                                 unsigned_char_dynarr *dst,
 165                                 unsigned int n);
 166 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
 167                             unsigned_char_dynarr *dst, unsigned int *flags);
 168 void char_finish_shift_jis (struct encoding_stream *str,
 169                             unsigned_char_dynarr *dst, unsigned int *flags);
 170
 171 static int detect_coding_big5 (struct detection_state *st,
 172                                CONST unsigned char *src,
 173                                unsigned int n);
 174 static void decode_coding_big5 (Lstream *decoding,
 175                                 CONST unsigned char *src,
 176                                 unsigned_char_dynarr *dst, unsigned int n);
 177 static void encode_coding_big5 (Lstream *encoding,
 178                                 CONST unsigned char *src,
 179                                 unsigned_char_dynarr *dst, unsigned int n);
 180 static int detect_coding_ucs4 (struct detection_state *st,
 181                                CONST unsigned char *src,
 182                                unsigned int n);
 183 static void decode_coding_ucs4 (Lstream *decoding,
 184                                 CONST unsigned char *src,
 185                                 unsigned_char_dynarr *dst, unsigned int n);
 186 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
 187                        unsigned_char_dynarr *dst, unsigned int *flags);
 188 void char_finish_ucs4 (struct encoding_stream *str,
 189                        unsigned_char_dynarr *dst, unsigned int *flags);
 190
 191 static int detect_coding_utf8 (struct detection_state *st,
 192                                CONST unsigned char *src,
 193                                unsigned int n);
 194 static void decode_coding_utf8 (Lstream *decoding,
 195                                 CONST unsigned char *src,
 196                                 unsigned_char_dynarr *dst, unsigned int n);
 197 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
 198                        unsigned_char_dynarr *dst, unsigned int *flags);
 199 void char_finish_utf8 (struct encoding_stream *str,
 200                        unsigned_char_dynarr *dst, unsigned int *flags);
 201
 202 static int postprocess_iso2022_mask (int mask);
 203 static void reset_iso2022 (Lisp_Object coding_system,
 204                            struct iso2022_decoder *iso);
 205 static int detect_coding_iso2022 (struct detection_state *st,
 206                                   CONST unsigned char *src,
 207                                   unsigned int n);
 208 static void decode_coding_iso2022 (Lstream *decoding,
 209                                    CONST unsigned char *src,
 210                                    unsigned_char_dynarr *dst, unsigned int n);
 211 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
 212                           unsigned_char_dynarr *dst, unsigned int *flags);
 213 void char_finish_iso2022 (struct encoding_stream *str,
 214                           unsigned_char_dynarr *dst, unsigned int *flags);
 215 #endif /* MULE */
 216 static void decode_coding_no_conversion (Lstream *decoding,
 217                                          CONST unsigned char *src,
 218                                          unsigned_char_dynarr *dst,
 219                                          unsigned int n);
 220 static void encode_coding_no_conversion (Lstream *encoding,
 221                                          CONST unsigned char *src,
 222                                          unsigned_char_dynarr *dst,
 223                                          unsigned int n);
 224 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
 225                          unsigned_char_dynarr *dst, unsigned int n);
 226 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
 227                          unsigned_char_dynarr *dst, unsigned int n);
 228
 229 typedef struct codesys_prop codesys_prop;
 230 struct codesys_prop
 231 {
 232   Lisp_Object sym;
 233   int prop_type;
 234 };
 235
 236 typedef struct
 237 {
 238   Dynarr_declare (codesys_prop);
 239 } codesys_prop_dynarr;
 240
 241 codesys_prop_dynarr *the_codesys_prop_dynarr;
 242
 243 enum codesys_prop_enum
 244 {
 245   CODESYS_PROP_ALL_OK,
 246   CODESYS_PROP_ISO2022,
 247   CODESYS_PROP_CCL
 248 };
 249
 250 \f
 251 /************************************************************************/
 252 /*                       Coding system functions                        */
 253 /************************************************************************/
 254
 255 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
 256 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 257 static void finalize_coding_system (void *header, int for_disksave);
 258
 259 #ifdef MULE
 260 static const struct lrecord_description ccs_description_1[] = {
 261   { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
 262   { XD_END }
 263 };
 264
 265 static const struct struct_description ccs_description = {
 266   sizeof(charset_conversion_spec),
 267   ccs_description_1
 268 };
 269
 270 static const struct lrecord_description ccsd_description_1[] = {
 271   XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
 272   { XD_END }
 273 };
 274
 275 static const struct struct_description ccsd_description = {
 276   sizeof(charset_conversion_spec_dynarr),
 277   ccsd_description_1
 278 };
 279 #endif
 280
 281 static const struct lrecord_description coding_system_description[] = {
 282   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
 283   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
 284   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
 285 #ifdef MULE
 286   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
 287   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 288   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 289   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
 290 #endif
 291   { XD_END }
 292 };
 293
 294 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 295                                mark_coding_system, print_coding_system,
 296                                finalize_coding_system,
 297                                0, 0, coding_system_description,
 298                                struct Lisp_Coding_System);
 299
 300 static Lisp_Object
 301 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
 302 {
 303   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 304
 305   markobj (CODING_SYSTEM_NAME (codesys));
 306   markobj (CODING_SYSTEM_DOC_STRING (codesys));
 307   markobj (CODING_SYSTEM_MNEMONIC (codesys));
 308   markobj (CODING_SYSTEM_EOL_LF (codesys));
 309   markobj (CODING_SYSTEM_EOL_CRLF (codesys));
 310   markobj (CODING_SYSTEM_EOL_CR (codesys));
 311
 312   switch (CODING_SYSTEM_TYPE (codesys))
 313     {
 314 #ifdef MULE
 315       int i;
 316     case CODESYS_ISO2022:
 317       for (i = 0; i < 4; i++)
 318         markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 319       if (codesys->iso2022.input_conv)
 320         {
 321           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 322             {
 323               struct charset_conversion_spec *ccs =
 324                 Dynarr_atp (codesys->iso2022.input_conv, i);
 325               markobj (ccs->from_charset);
 326               markobj (ccs->to_charset);
 327             }
 328         }
 329       if (codesys->iso2022.output_conv)
 330         {
 331           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 332             {
 333               struct charset_conversion_spec *ccs =
 334                 Dynarr_atp (codesys->iso2022.output_conv, i);
 335               markobj (ccs->from_charset);
 336               markobj (ccs->to_charset);
 337             }
 338         }
 339       break;
 340
 341     case CODESYS_CCL:
 342       markobj (CODING_SYSTEM_CCL_DECODE (codesys));
 343       markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
 344       break;
 345 #endif /* MULE */
 346     default:
 347       break;
 348     }
 349
 350   markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 351   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 352 }
 353
 354 static void
 355 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 356                      int escapeflag)
 357 {
 358   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 359   if (print_readably)
 360     error ("printing unreadable object #<coding_system 0x%x>",
 361            c->header.uid);
 362
 363   write_c_string ("#<coding_system ", printcharfun);
 364   print_internal (c->name, printcharfun, 1);
 365   write_c_string (">", printcharfun);
 366 }
 367
 368 static void
 369 finalize_coding_system (void *header, int for_disksave)
 370 {
 371   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 372   /* Since coding systems never go away, this function is not
 373      necessary.  But it would be necessary if we changed things
 374      so that coding systems could go away. */
 375   if (!for_disksave) /* see comment in lstream.c */
 376     {
 377       switch (CODING_SYSTEM_TYPE (c))
 378         {
 379 #ifdef MULE
 380         case CODESYS_ISO2022:
 381           if (c->iso2022.input_conv)
 382             {
 383               Dynarr_free (c->iso2022.input_conv);
 384               c->iso2022.input_conv = 0;
 385             }
 386           if (c->iso2022.output_conv)
 387             {
 388               Dynarr_free (c->iso2022.output_conv);
 389               c->iso2022.output_conv = 0;
 390             }
 391           break;
 392 #endif /* MULE */
 393         default:
 394           break;
 395         }
 396     }
 397 }
 398
 399 static enum eol_type
 400 symbol_to_eol_type (Lisp_Object symbol)
 401 {
 402   CHECK_SYMBOL (symbol);
 403   if (NILP (symbol))      return EOL_AUTODETECT;
 404   if (EQ (symbol, Qlf))   return EOL_LF;
 405   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 406   if (EQ (symbol, Qcr))   return EOL_CR;
 407
 408   signal_simple_error ("Unrecognized eol type", symbol);
 409   return EOL_AUTODETECT; /* not reached */
 410 }
 411
 412 static Lisp_Object
 413 eol_type_to_symbol (enum eol_type type)
 414 {
 415   switch (type)
 416     {
 417     default: abort ();
 418     case EOL_LF:         return Qlf;
 419     case EOL_CRLF:       return Qcrlf;
 420     case EOL_CR:         return Qcr;
 421     case EOL_AUTODETECT: return Qnil;
 422     }
 423 }
 424
 425 static void
 426 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 427 {
 428   Lisp_Object codesys_obj;
 429   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 430   char *codesys_name = (char *) alloca (len + 7);
 431   int mlen = -1;
 432   char *codesys_mnemonic=0;
 433
 434   Lisp_Object codesys_name_sym, sub_codesys_obj;
 435
 436   /* kludge */
 437
 438   XSETCODING_SYSTEM (codesys_obj, codesys);
 439
 440   memcpy (codesys_name,
 441           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 442
 443   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 444     {
 445       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 446       codesys_mnemonic = (char *) alloca (mlen + 7);
 447       memcpy (codesys_mnemonic,
 448               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 449     }
 450
 451 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 452   strcpy (codesys_name + len, "-" op_sys);                                      \
 453   if (mlen != -1)                                                               \
 454     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 455   codesys_name_sym = intern (codesys_name);                                     \
 456   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 457   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 458   if (mlen != -1)                                                               \
 459     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 460       build_string (codesys_mnemonic);                                          \
 461   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 462 } while (0)
 463
 464   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 465   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 466   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 467 }
 468
 469 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 470 Return t if OBJECT is a coding system.
 471 A coding system is an object that defines how text containing multiple
 472 character sets is encoded into a stream of (typically 8-bit) bytes.
 473 The coding system is used to decode the stream into a series of
 474 characters (which may be from multiple charsets) when the text is read
 475 from a file or process, and is used to encode the text back into the
 476 same format when it is written out to a file or process.
 477
 478 For example, many ISO2022-compliant coding systems (such as Compound
 479 Text, which is used for inter-client data under the X Window System)
 480 use escape sequences to switch between different charsets -- Japanese
 481 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 482 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 483 `make-coding-system' for more information.
 484
 485 Coding systems are normally identified using a symbol, and the
 486 symbol is accepted in place of the actual coding system object whenever
 487 a coding system is called for. (This is similar to how faces work.)
 488 */
 489        (object))
 490 {
 491   return CODING_SYSTEMP (object) ? Qt : Qnil;
 492 }
 493
 494 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 495 Retrieve the coding system of the given name.
 496
 497 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 498 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 499 If there is no such coding system, nil is returned.  Otherwise the
 500 associated coding system object is returned.
 501 */
 502        (coding_system_or_name))
 503 {
 504   if (CODING_SYSTEMP (coding_system_or_name))
 505     return coding_system_or_name;
 506
 507   if (NILP (coding_system_or_name))
 508     coding_system_or_name = Qbinary;
 509   else
 510     CHECK_SYMBOL (coding_system_or_name);
 511
 512   return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 513 }
 514
 515 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 516 Retrieve the coding system of the given name.
 517 Same as `find-coding-system' except that if there is no such
 518 coding system, an error is signaled instead of returning nil.
 519 */
 520        (name))
 521 {
 522   Lisp_Object coding_system = Ffind_coding_system (name);
 523
 524   if (NILP (coding_system))
 525     signal_simple_error ("No such coding system", name);
 526   return coding_system;
 527 }
 528
 529 /* We store the coding systems in hash tables with the names as the key and the
 530    actual coding system object as the value.  Occasionally we need to use them
 531    in a list format.  These routines provide us with that. */
 532 struct coding_system_list_closure
 533 {
 534   Lisp_Object *coding_system_list;
 535 };
 536
 537 static int
 538 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 539                                   void *coding_system_list_closure)
 540 {
 541   /* This function can GC */
 542   struct coding_system_list_closure *cscl =
 543     (struct coding_system_list_closure *) coding_system_list_closure;
 544   Lisp_Object *coding_system_list = cscl->coding_system_list;
 545
 546   *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
 547                                *coding_system_list);
 548   return 0;
 549 }
 550
 551 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 552 Return a list of the names of all defined coding systems.
 553 */
 554        ())
 555 {
 556   Lisp_Object coding_system_list = Qnil;
 557   struct gcpro gcpro1;
 558   struct coding_system_list_closure coding_system_list_closure;
 559
 560   GCPRO1 (coding_system_list);
 561   coding_system_list_closure.coding_system_list = &coding_system_list;
 562   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 563                  &coding_system_list_closure);
 564   UNGCPRO;
 565
 566   return coding_system_list;
 567 }
 568
 569 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 570 Return the name of the given coding system.
 571 */
 572        (coding_system))
 573 {
 574   coding_system = Fget_coding_system (coding_system);
 575   return XCODING_SYSTEM_NAME (coding_system);
 576 }
 577
 578 static Lisp_Coding_System *
 579 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 580 {
 581   Lisp_Coding_System *codesys =
 582     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 583
 584   zero_lcrecord (codesys);
 585   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 586   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 587   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 588   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 589   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 590   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 591   CODING_SYSTEM_TYPE     (codesys) = type;
 592   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 593 #ifdef MULE
 594   if (type == CODESYS_ISO2022)
 595     {
 596       int i;
 597       for (i = 0; i < 4; i++)
 598         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 599     }
 600   else if (type == CODESYS_CCL)
 601     {
 602       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 603       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 604     }
 605 #endif /* MULE */
 606   CODING_SYSTEM_NAME (codesys) = name;
 607
 608   return codesys;
 609 }
 610
 611 #ifdef MULE
 612 /* Given a list of charset conversion specs as specified in a Lisp
 613    program, parse it into STORE_HERE. */
 614
 615 static void
 616 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 617                                 Lisp_Object spec_list)
 618 {
 619   Lisp_Object rest;
 620
 621   EXTERNAL_LIST_LOOP (rest, spec_list)
 622     {
 623       Lisp_Object car = XCAR (rest);
 624       Lisp_Object from, to;
 625       struct charset_conversion_spec spec;
 626
 627       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 628         signal_simple_error ("Invalid charset conversion spec", car);
 629       from = Fget_charset (XCAR (car));
 630       to = Fget_charset (XCAR (XCDR (car)));
 631       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 632         signal_simple_error_2
 633           ("Attempted conversion between different charset types",
 634            from, to);
 635       spec.from_charset = from;
 636       spec.to_charset = to;
 637
 638       Dynarr_add (store_here, spec);
 639     }
 640 }
 641
 642 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 643    specs, return the equivalent as the Lisp programmer would see it.
 644
 645    If LOAD_HERE is 0, return Qnil. */
 646
 647 static Lisp_Object
 648 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 649 {
 650   int i;
 651   Lisp_Object result;
 652
 653   if (!load_here)
 654     return Qnil;
 655   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 656     {
 657       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 658       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 659     }
 660
 661   return Fnreverse (result);
 662 }
 663
 664 #endif /* MULE */
 665
 666 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 667 Register symbol NAME as a coding system.
 668
 669 TYPE describes the conversion method used and should be one of
 670
 671 nil or 'undecided
 672      Automatic conversion.  XEmacs attempts to detect the coding system
 673      used in the file.
 674 'no-conversion
 675      No conversion.  Use this for binary files and such.  On output,
 676      graphic characters that are not in ASCII or Latin-1 will be
 677      replaced by a ?. (For a no-conversion-encoded buffer, these
 678      characters will only be present if you explicitly insert them.)
 679 'shift-jis
 680      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 681 'ucs-4
 682      ISO 10646 UCS-4 encoding.
 683 'utf-8
 684      ISO 10646 UTF-8 encoding.
 685 'iso2022
 686      Any ISO2022-compliant encoding.  Among other things, this includes
 687      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 688      standard Unix encoding for Japanese and other languages), and
 689      Compound Text (the encoding used in X11).  You can specify more
 690      specific information about the conversion with the FLAGS argument.
 691 'big5
 692      Big5 (the encoding commonly used for Taiwanese).
 693 'ccl
 694      The conversion is performed using a user-written pseudo-code
 695      program.  CCL (Code Conversion Language) is the name of this
 696      pseudo-code.
 697 'internal
 698      Write out or read in the raw contents of the memory representing
 699      the buffer's text.  This is primarily useful for debugging
 700      purposes, and is only enabled when XEmacs has been compiled with
 701      DEBUG_XEMACS defined (via the --debug configure option).
 702      WARNING: Reading in a file using 'internal conversion can result
 703      in an internal inconsistency in the memory representing a
 704      buffer's text, which will produce unpredictable results and may
 705      cause XEmacs to crash.  Under normal circumstances you should
 706      never use 'internal conversion.
 707
 708 DOC-STRING is a string describing the coding system.
 709
 710 PROPS is a property list, describing the specific nature of the
 711 character set.  Recognized properties are:
 712
 713 'mnemonic
 714      String to be displayed in the modeline when this coding system is
 715      active.
 716
 717 'eol-type
 718      End-of-line conversion to be used.  It should be one of
 719
 720         nil
 721                 Automatically detect the end-of-line type (LF, CRLF,
 722                 or CR).  Also generate subsidiary coding systems named
 723                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 724                 identical to this coding system but have an EOL-TYPE
 725                 value of 'lf, 'crlf, and 'cr, respectively.
 726         'lf
 727                 The end of a line is marked externally using ASCII LF.
 728                 Since this is also the way that XEmacs represents an
 729                 end-of-line internally, specifying this option results
 730                 in no end-of-line conversion.  This is the standard
 731                 format for Unix text files.
 732         'crlf
 733                 The end of a line is marked externally using ASCII
 734                 CRLF.  This is the standard format for MS-DOS text
 735                 files.
 736         'cr
 737                 The end of a line is marked externally using ASCII CR.
 738                 This is the standard format for Macintosh text files.
 739         t
 740                 Automatically detect the end-of-line type but do not
 741                 generate subsidiary coding systems.  (This value is
 742                 converted to nil when stored internally, and
 743                 `coding-system-property' will return nil.)
 744
 745 'post-read-conversion
 746      Function called after a file has been read in, to perform the
 747      decoding.  Called with two arguments, BEG and END, denoting
 748      a region of the current buffer to be decoded.
 749
 750 'pre-write-conversion
 751      Function called before a file is written out, to perform the
 752      encoding.  Called with two arguments, BEG and END, denoting
 753      a region of the current buffer to be encoded.
 754
 755
 756 The following additional properties are recognized if TYPE is 'iso2022:
 757
 758 'charset-g0
 759 'charset-g1
 760 'charset-g2
 761 'charset-g3
 762      The character set initially designated to the G0 - G3 registers.
 763      The value should be one of
 764
 765           -- A charset object (designate that character set)
 766           -- nil (do not ever use this register)
 767           -- t (no character set is initially designated to
 768                 the register, but may be later on; this automatically
 769                 sets the corresponding `force-g*-on-output' property)
 770
 771 'force-g0-on-output
 772 'force-g1-on-output
 773 'force-g2-on-output
 774 'force-g2-on-output
 775      If non-nil, send an explicit designation sequence on output before
 776      using the specified register.
 777
 778 'short
 779      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 780      "ESC $ B" on output in place of the full designation sequences
 781      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 782
 783 'no-ascii-eol
 784      If non-nil, don't designate ASCII to G0 at each end of line on output.
 785      Setting this to non-nil also suppresses other state-resetting that
 786      normally happens at the end of a line.
 787
 788 'no-ascii-cntl
 789      If non-nil, don't designate ASCII to G0 before control chars on output.
 790
 791 'seven
 792      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 793      environment.
 794
 795 'lock-shift
 796      If non-nil, use locking-shift (SO/SI) instead of single-shift
 797      or designation by escape sequence.
 798
 799 'no-iso6429
 800      If non-nil, don't use ISO6429's direction specification.
 801
 802 'escape-quoted
 803      If non-nil, literal control characters that are the same as
 804      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 805      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 806      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 807      so that they can be properly distinguished from an escape sequence.
 808      (Note that doing this results in a non-portable encoding.) This
 809      encoding flag is used for byte-compiled files.  Note that ESC
 810      is a good choice for a quoting character because there are no
 811      escape sequences whose second byte is a character from the Control-0
 812      or Control-1 character sets; this is explicitly disallowed by the
 813      ISO2022 standard.
 814
 815 'input-charset-conversion
 816      A list of conversion specifications, specifying conversion of
 817      characters in one charset to another when decoding is performed.
 818      Each specification is a list of two elements: the source charset,
 819      and the destination charset.
 820
 821 'output-charset-conversion
 822      A list of conversion specifications, specifying conversion of
 823      characters in one charset to another when encoding is performed.
 824      The form of each specification is the same as for
 825      'input-charset-conversion.
 826
 827
 828 The following additional properties are recognized (and required)
 829 if TYPE is 'ccl:
 830
 831 'decode
 832      CCL program used for decoding (converting to internal format).
 833
 834 'encode
 835      CCL program used for encoding (converting to external format).
 836 */
 837        (name, type, doc_string, props))
 838 {
 839   Lisp_Coding_System *codesys;
 840   Lisp_Object rest, key, value;
 841   enum coding_system_type ty;
 842   int need_to_setup_eol_systems = 1;
 843
 844   /* Convert type to constant */
 845   if (NILP (type) || EQ (type, Qundecided))
 846                                       { ty = CODESYS_AUTODETECT; }
 847 #ifdef MULE
 848   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 849   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 850   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 851   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 852   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 853   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 854 #endif
 855   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 856 #ifdef DEBUG_XEMACS
 857   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 858 #endif
 859   else
 860     signal_simple_error ("Invalid coding system type", type);
 861
 862   CHECK_SYMBOL (name);
 863
 864   codesys = allocate_coding_system (ty, name);
 865
 866   if (NILP (doc_string))
 867     doc_string = build_string ("");
 868   else
 869     CHECK_STRING (doc_string);
 870   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 871
 872   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 873     {
 874       if (EQ (key, Qmnemonic))
 875         {
 876           if (!NILP (value))
 877             CHECK_STRING (value);
 878           CODING_SYSTEM_MNEMONIC (codesys) = value;
 879         }
 880
 881       else if (EQ (key, Qeol_type))
 882         {
 883           need_to_setup_eol_systems = NILP (value);
 884           if (EQ (value, Qt))
 885             value = Qnil;
 886           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 887         }
 888
 889       else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 890       else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 891 #ifdef MULE
 892       else if (ty == CODESYS_ISO2022)
 893         {
 894 #define FROB_INITIAL_CHARSET(charset_num) \
 895   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 896     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 897
 898           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 899           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 900           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 901           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 902
 903 #define FROB_FORCE_CHARSET(charset_num) \
 904   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 905
 906           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 907           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 908           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 909           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 910
 911 #define FROB_BOOLEAN_PROPERTY(prop) \
 912   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 913
 914           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 915           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 916           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 917           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 918           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 919           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 920           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 921
 922           else if (EQ (key, Qinput_charset_conversion))
 923             {
 924               codesys->iso2022.input_conv =
 925                 Dynarr_new (charset_conversion_spec);
 926               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 927                                               value);
 928             }
 929           else if (EQ (key, Qoutput_charset_conversion))
 930             {
 931               codesys->iso2022.output_conv =
 932                 Dynarr_new (charset_conversion_spec);
 933               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 934                                               value);
 935             }
 936           else
 937             signal_simple_error ("Unrecognized property", key);
 938         }
 939       else if (EQ (type, Qccl))
 940         {
 941           if (EQ (key, Qdecode))
 942             {
 943               CHECK_VECTOR (value);
 944               CODING_SYSTEM_CCL_DECODE (codesys) = value;
 945             }
 946           else if (EQ (key, Qencode))
 947             {
 948               CHECK_VECTOR (value);
 949               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
 950             }
 951           else
 952             signal_simple_error ("Unrecognized property", key);
 953         }
 954 #endif /* MULE */
 955       else
 956         signal_simple_error ("Unrecognized property", key);
 957     }
 958
 959   if (need_to_setup_eol_systems)
 960     setup_eol_coding_systems (codesys);
 961
 962   {
 963     Lisp_Object codesys_obj;
 964     XSETCODING_SYSTEM (codesys_obj, codesys);
 965     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
 966     return codesys_obj;
 967   }
 968 }
 969
 970 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
 971 Copy OLD-CODING-SYSTEM to NEW-NAME.
 972 If NEW-NAME does not name an existing coding system, a new one will
 973 be created.
 974 */
 975        (old_coding_system, new_name))
 976 {
 977   Lisp_Object new_coding_system;
 978   old_coding_system = Fget_coding_system (old_coding_system);
 979   new_coding_system = Ffind_coding_system (new_name);
 980   if (NILP (new_coding_system))
 981     {
 982       XSETCODING_SYSTEM (new_coding_system,
 983                          allocate_coding_system
 984                          (XCODING_SYSTEM_TYPE (old_coding_system),
 985                           new_name));
 986       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
 987     }
 988
 989   {
 990     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
 991     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
 992     memcpy (((char *) to  ) + sizeof (to->header),
 993             ((char *) from) + sizeof (from->header),
 994             sizeof (*from) - sizeof (from->header));
 995     to->name = new_name;
 996   }
 997   return new_coding_system;
 998 }
 999
1000 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1001 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1002 */
1003        (alias, coding_system))
1004 {
1005   CHECK_SYMBOL (alias);
1006   if (!NILP (Ffind_coding_system (alias)))
1007     signal_simple_error ("Symbol already names a coding system", alias);
1008   coding_system = Fget_coding_system (coding_system);
1009   Fputhash (alias, coding_system, Vcoding_system_hash_table);
1010
1011   /* Set up aliases for subsidiaries. */
1012   if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1013     {
1014       Lisp_Object str;
1015       XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1016 #define FROB(type, name)                                                        \
1017       do {                                                                      \
1018         Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system);     \
1019         if (!NILP (subsidiary))                                                 \
1020           Fdefine_coding_system_alias                                           \
1021             (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary);   \
1022       } while (0)
1023       FROB (LF,   "-unix");
1024       FROB (CRLF, "-dos");
1025       FROB (CR,   "-mac");
1026 #undef FROB
1027     }
1028   /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
1029      but it doesn't look intentional, so I'd rather return something
1030      meaningful or nothing at all. */
1031   return Qnil;
1032 }
1033
1034 static Lisp_Object
1035 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1036 {
1037   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1038   Lisp_Object new_coding_system;
1039
1040   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1041     return coding_system;
1042
1043   switch (type)
1044     {
1045     case EOL_AUTODETECT: return coding_system;
1046     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1047     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1048     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1049     default:       abort ();
1050     }
1051
1052   return NILP (new_coding_system) ? coding_system : new_coding_system;
1053 }
1054
1055 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1056 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1057 */
1058        (coding_system, eol_type))
1059 {
1060   coding_system = Fget_coding_system (coding_system);
1061
1062   return subsidiary_coding_system (coding_system,
1063                                    symbol_to_eol_type (eol_type));
1064 }
1065
1066 \f
1067 /************************************************************************/
1068 /*                         Coding system accessors                      */
1069 /************************************************************************/
1070
1071 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1072 Return the doc string for CODING-SYSTEM.
1073 */
1074        (coding_system))
1075 {
1076   coding_system = Fget_coding_system (coding_system);
1077   return XCODING_SYSTEM_DOC_STRING (coding_system);
1078 }
1079
1080 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1081 Return the type of CODING-SYSTEM.
1082 */
1083        (coding_system))
1084 {
1085   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1086     {
1087     default: abort ();
1088     case CODESYS_AUTODETECT:    return Qundecided;
1089 #ifdef MULE
1090     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1091     case CODESYS_ISO2022:       return Qiso2022;
1092     case CODESYS_BIG5:          return Qbig5;
1093     case CODESYS_UCS4:          return Qucs4;
1094     case CODESYS_UTF8:          return Qutf8;
1095     case CODESYS_CCL:           return Qccl;
1096 #endif
1097     case CODESYS_NO_CONVERSION: return Qno_conversion;
1098 #ifdef DEBUG_XEMACS
1099     case CODESYS_INTERNAL:      return Qinternal;
1100 #endif
1101     }
1102 }
1103
1104 #ifdef MULE
1105 static
1106 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1107 {
1108   Lisp_Object cs
1109     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1110
1111   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1112 }
1113
1114 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1115 Return initial charset of CODING-SYSTEM designated to GNUM.
1116 GNUM allows 0 .. 3.
1117 */
1118        (coding_system, gnum))
1119 {
1120   coding_system = Fget_coding_system (coding_system);
1121   CHECK_INT (gnum);
1122
1123   return coding_system_charset (coding_system, XINT (gnum));
1124 }
1125 #endif /* MULE */
1126
1127 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1128 Return the PROP property of CODING-SYSTEM.
1129 */
1130        (coding_system, prop))
1131 {
1132   int i, ok = 0;
1133   enum coding_system_type type;
1134
1135   coding_system = Fget_coding_system (coding_system);
1136   CHECK_SYMBOL (prop);
1137   type = XCODING_SYSTEM_TYPE (coding_system);
1138
1139   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1140     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1141       {
1142         ok = 1;
1143         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1144           {
1145           case CODESYS_PROP_ALL_OK:
1146             break;
1147 #ifdef MULE
1148           case CODESYS_PROP_ISO2022:
1149             if (type != CODESYS_ISO2022)
1150               signal_simple_error
1151                 ("Property only valid in ISO2022 coding systems",
1152                  prop);
1153             break;
1154
1155           case CODESYS_PROP_CCL:
1156             if (type != CODESYS_CCL)
1157               signal_simple_error
1158                 ("Property only valid in CCL coding systems",
1159                  prop);
1160             break;
1161 #endif /* MULE */
1162           default:
1163             abort ();
1164           }
1165       }
1166
1167   if (!ok)
1168     signal_simple_error ("Unrecognized property", prop);
1169
1170   if (EQ (prop, Qname))
1171     return XCODING_SYSTEM_NAME (coding_system);
1172   else if (EQ (prop, Qtype))
1173     return Fcoding_system_type (coding_system);
1174   else if (EQ (prop, Qdoc_string))
1175     return XCODING_SYSTEM_DOC_STRING (coding_system);
1176   else if (EQ (prop, Qmnemonic))
1177     return XCODING_SYSTEM_MNEMONIC (coding_system);
1178   else if (EQ (prop, Qeol_type))
1179     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1180   else if (EQ (prop, Qeol_lf))
1181     return XCODING_SYSTEM_EOL_LF (coding_system);
1182   else if (EQ (prop, Qeol_crlf))
1183     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1184   else if (EQ (prop, Qeol_cr))
1185     return XCODING_SYSTEM_EOL_CR (coding_system);
1186   else if (EQ (prop, Qpost_read_conversion))
1187     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1188   else if (EQ (prop, Qpre_write_conversion))
1189     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1190 #ifdef MULE
1191   else if (type == CODESYS_ISO2022)
1192     {
1193       if (EQ (prop, Qcharset_g0))
1194         return coding_system_charset (coding_system, 0);
1195       else if (EQ (prop, Qcharset_g1))
1196         return coding_system_charset (coding_system, 1);
1197       else if (EQ (prop, Qcharset_g2))
1198         return coding_system_charset (coding_system, 2);
1199       else if (EQ (prop, Qcharset_g3))
1200         return coding_system_charset (coding_system, 3);
1201
1202 #define FORCE_CHARSET(charset_num) \
1203   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1204    (coding_system, charset_num) ? Qt : Qnil)
1205
1206       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1207       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1208       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1209       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1210
1211 #define LISP_BOOLEAN(prop) \
1212   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1213
1214       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1215       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1216       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1217       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1218       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1219       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1220       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1221
1222       else if (EQ (prop, Qinput_charset_conversion))
1223         return
1224           unparse_charset_conversion_specs
1225             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1226       else if (EQ (prop, Qoutput_charset_conversion))
1227         return
1228           unparse_charset_conversion_specs
1229             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1230       else
1231         abort ();
1232     }
1233   else if (type == CODESYS_CCL)
1234     {
1235       if (EQ (prop, Qdecode))
1236         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1237       else if (EQ (prop, Qencode))
1238         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1239       else
1240         abort ();
1241     }
1242 #endif /* MULE */
1243   else
1244     abort ();
1245
1246   return Qnil; /* not reached */
1247 }
1248
1249 \f
1250 /************************************************************************/
1251 /*                       Coding category functions                      */
1252 /************************************************************************/
1253
1254 static int
1255 decode_coding_category (Lisp_Object symbol)
1256 {
1257   int i;
1258
1259   CHECK_SYMBOL (symbol);
1260   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1261     if (EQ (coding_category_symbol[i], symbol))
1262       return i;
1263
1264   signal_simple_error ("Unrecognized coding category", symbol);
1265   return 0; /* not reached */
1266 }
1267
1268 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1269 Return a list of all recognized coding categories.
1270 */
1271        ())
1272 {
1273   int i;
1274   Lisp_Object list = Qnil;
1275
1276   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1277     list = Fcons (coding_category_symbol[i], list);
1278   return list;
1279 }
1280
1281 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1282 Change the priority order of the coding categories.
1283 LIST should be list of coding categories, in descending order of
1284 priority.  Unspecified coding categories will be lower in priority
1285 than all specified ones, in the same relative order they were in
1286 previously.
1287 */
1288        (list))
1289 {
1290   int category_to_priority[CODING_CATEGORY_LAST + 1];
1291   int i, j;
1292   Lisp_Object rest;
1293
1294   /* First generate a list that maps coding categories to priorities. */
1295
1296   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1297     category_to_priority[i] = -1;
1298
1299   /* Highest priority comes from the specified list. */
1300   i = 0;
1301   EXTERNAL_LIST_LOOP (rest, list)
1302     {
1303       int cat = decode_coding_category (XCAR (rest));
1304
1305       if (category_to_priority[cat] >= 0)
1306         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1307       category_to_priority[cat] = i++;
1308     }
1309
1310   /* Now go through the existing categories by priority to retrieve
1311      the categories not yet specified and preserve their priority
1312      order. */
1313   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1314     {
1315       int cat = coding_category_by_priority[j];
1316       if (category_to_priority[cat] < 0)
1317         category_to_priority[cat] = i++;
1318     }
1319
1320   /* Now we need to construct the inverse of the mapping we just
1321      constructed. */
1322
1323   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1324     coding_category_by_priority[category_to_priority[i]] = i;
1325
1326   /* Phew!  That was confusing. */
1327   return Qnil;
1328 }
1329
1330 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1331 Return a list of coding categories in descending order of priority.
1332 */
1333        ())
1334 {
1335   int i;
1336   Lisp_Object list = Qnil;
1337
1338   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1339     list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1340                   list);
1341   return list;
1342 }
1343
1344 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1345 Change the coding system associated with a coding category.
1346 */
1347        (coding_category, coding_system))
1348 {
1349   int cat = decode_coding_category (coding_category);
1350
1351   coding_system = Fget_coding_system (coding_system);
1352   coding_category_system[cat] = coding_system;
1353   return Qnil;
1354 }
1355
1356 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1357 Return the coding system associated with a coding category.
1358 */
1359        (coding_category))
1360 {
1361   int cat = decode_coding_category (coding_category);
1362   Lisp_Object sys = coding_category_system[cat];
1363
1364   if (!NILP (sys))
1365     return XCODING_SYSTEM_NAME (sys);
1366   return Qnil;
1367 }
1368
1369 \f
1370 /************************************************************************/
1371 /*                     Detecting the encoding of data                   */
1372 /************************************************************************/
1373
1374 struct detection_state
1375 {
1376   enum eol_type eol_type;
1377   int seen_non_ascii;
1378   int mask;
1379 #ifdef MULE
1380   struct
1381     {
1382       int mask;
1383       int in_second_byte;
1384     }
1385   big5;
1386
1387   struct
1388     {
1389       int mask;
1390       int in_second_byte;
1391     }
1392   shift_jis;
1393
1394   struct
1395     {
1396       int mask;
1397       int in_byte;
1398   }
1399   ucs4;
1400
1401   struct
1402     {
1403       int mask;
1404       int in_byte;
1405     }
1406   utf8;
1407
1408   struct
1409     {
1410       int mask;
1411       int initted;
1412       struct iso2022_decoder iso;
1413       unsigned int flags;
1414       int high_byte_count;
1415       unsigned int saw_single_shift:1;
1416     }
1417   iso2022;
1418 #endif
1419   struct
1420     {
1421       int seen_anything;
1422       int just_saw_cr;
1423     }
1424   eol;
1425 };
1426
1427 static int
1428 acceptable_control_char_p (int c)
1429 {
1430   switch (c)
1431     {
1432       /* Allow and ignore control characters that you might
1433          reasonably see in a text file */
1434     case '\r':
1435     case '\n':
1436     case '\t':
1437     case  7: /* bell */
1438     case  8: /* backspace */
1439     case 11: /* vertical tab */
1440     case 12: /* form feed */
1441     case 26: /* MS-DOS C-z junk */
1442     case 31: /* '^_' -- for info */
1443       return 1;
1444     default:
1445       return 0;
1446     }
1447 }
1448
1449 static int
1450 mask_has_at_most_one_bit_p (int mask)
1451 {
1452   /* Perhaps the only thing useful you learn from intensive Microsoft
1453      technical interviews */
1454   return (mask & (mask - 1)) == 0;
1455 }
1456
1457 static enum eol_type
1458 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1459                  unsigned int n)
1460 {
1461   int c;
1462
1463   while (n--)
1464     {
1465       c = *src++;
1466       if (c == '\r')
1467         st->eol.just_saw_cr = 1;
1468       else
1469         {
1470           if (c == '\n')
1471             {
1472               if (st->eol.just_saw_cr)
1473                 return EOL_CRLF;
1474               else if (st->eol.seen_anything)
1475                 return EOL_LF;
1476             }
1477           else if (st->eol.just_saw_cr)
1478             return EOL_CR;
1479           st->eol.just_saw_cr = 0;
1480         }
1481       st->eol.seen_anything = 1;
1482     }
1483
1484   return EOL_AUTODETECT;
1485 }
1486
1487 /* Attempt to determine the encoding and EOL type of the given text.
1488    Before calling this function for the first type, you must initialize
1489    st->eol_type as appropriate and initialize st->mask to ~0.
1490
1491    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1492    not yet known.
1493
1494    st->mask holds the determined coding category mask, or ~0 if only
1495    ASCII has been seen so far.
1496
1497    Returns:
1498
1499    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1500         is present in st->mask
1501    1 == definitive answers are here for both st->eol_type and st->mask
1502 */
1503
1504 static int
1505 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1506                     unsigned int n, int just_do_eol)
1507 {
1508   int c;
1509
1510   if (st->eol_type == EOL_AUTODETECT)
1511     st->eol_type = detect_eol_type (st, src, n);
1512
1513   if (just_do_eol)
1514     return st->eol_type != EOL_AUTODETECT;
1515
1516   if (!st->seen_non_ascii)
1517     {
1518       for (; n; n--, src++)
1519         {
1520           c = *src;
1521           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1522             {
1523               st->seen_non_ascii = 1;
1524 #ifdef MULE
1525               st->shift_jis.mask = ~0;
1526               st->big5.mask = ~0;
1527               st->ucs4.mask = ~0;
1528               st->utf8.mask = ~0;
1529               st->iso2022.mask = ~0;
1530 #endif
1531               break;
1532             }
1533         }
1534     }
1535
1536   if (!n)
1537     return 0;
1538 #ifdef MULE
1539   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1540     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1541   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1542     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1543   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1544     st->big5.mask = detect_coding_big5 (st, src, n);
1545   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1546     st->utf8.mask = detect_coding_utf8 (st, src, n);
1547   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1548     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1549
1550   st->mask
1551     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1552     | st->utf8.mask | st->ucs4.mask;
1553 #endif
1554   {
1555     int retval = mask_has_at_most_one_bit_p (st->mask);
1556     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1557     return retval && st->eol_type != EOL_AUTODETECT;
1558   }
1559 }
1560
1561 static Lisp_Object
1562 coding_system_from_mask (int mask)
1563 {
1564   if (mask == ~0)
1565     {
1566       /* If the file was entirely or basically ASCII, use the
1567          default value of `buffer-file-coding-system'. */
1568       Lisp_Object retval =
1569         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1570       if (!NILP (retval))
1571         {
1572           retval = Ffind_coding_system (retval);
1573           if (NILP (retval))
1574             {
1575               warn_when_safe
1576                 (Qbad_variable, Qwarning,
1577                  "Invalid `default-buffer-file-coding-system', set to nil");
1578               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1579             }
1580         }
1581       if (NILP (retval))
1582         retval = Fget_coding_system (Qraw_text);
1583       return retval;
1584     }
1585   else
1586     {
1587       int i;
1588       int cat = -1;
1589 #ifdef MULE
1590       mask = postprocess_iso2022_mask (mask);
1591 #endif
1592       /* Look through the coding categories by priority and find
1593          the first one that is allowed. */
1594       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1595         {
1596           cat = coding_category_by_priority[i];
1597           if ((mask & (1 << cat)) &&
1598               !NILP (coding_category_system[cat]))
1599             break;
1600         }
1601       if (cat >= 0)
1602         return coding_category_system[cat];
1603       else
1604         return Fget_coding_system (Qraw_text);
1605     }
1606 }
1607
1608 /* Given a seekable read stream and potential coding system and EOL type
1609    as specified, do any autodetection that is called for.  If the
1610    coding system and/or EOL type are not autodetect, they will be left
1611    alone; but this function will never return an autodetect coding system
1612    or EOL type.
1613
1614    This function does not automatically fetch subsidiary coding systems;
1615    that should be unnecessary with the explicit eol-type argument. */
1616
1617 void
1618 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1619                               enum eol_type *eol_type_in_out)
1620 {
1621   struct detection_state decst;
1622
1623   if (*eol_type_in_out == EOL_AUTODETECT)
1624     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1625
1626   xzero (decst);
1627   decst.eol_type = *eol_type_in_out;
1628   decst.mask = ~0;
1629
1630   /* If autodetection is called for, do it now. */
1631   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1632       *eol_type_in_out == EOL_AUTODETECT)
1633     {
1634       unsigned char random_buffer[4096];
1635       int nread;
1636       Lisp_Object coding_system = Qnil;
1637
1638       nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1639       if (nread)
1640         {
1641           unsigned char *cp = random_buffer;
1642
1643           while (cp < random_buffer + nread)
1644             {
1645               if ((*cp++ == 'c') && (cp < random_buffer + nread) &&
1646                   (*cp++ == 'o') && (cp < random_buffer + nread) &&
1647                   (*cp++ == 'd') && (cp < random_buffer + nread) &&
1648                   (*cp++ == 'i') && (cp < random_buffer + nread) &&
1649                   (*cp++ == 'n') && (cp < random_buffer + nread) &&
1650                   (*cp++ == 'g') && (cp < random_buffer + nread) &&
1651                   (*cp++ == ':') && (cp < random_buffer + nread))
1652                 {
1653                   unsigned char coding_system_name[4096 - 6];
1654                   unsigned char *np = coding_system_name;
1655
1656                   while ( (cp < random_buffer + nread)
1657                           && ((*cp == ' ') || (*cp == '\t')) )
1658                     {
1659                       cp++;
1660                     }
1661                   while ( (cp < random_buffer + nread) &&
1662                           (*cp != ' ') && (*cp != '\t') && (*cp != ';') )
1663                     {
1664                       *np++ = *cp++;
1665                     }
1666                   *np = 0;
1667                   coding_system
1668                     = Ffind_coding_system (intern (coding_system_name));
1669                   break;
1670                 }
1671             }
1672           if (EQ(coding_system, Qnil))
1673             do{
1674               if (detect_coding_type (&decst, random_buffer, nread,
1675                                       XCODING_SYSTEM_TYPE (*codesys_in_out)
1676                                       != CODESYS_AUTODETECT))
1677                 break;
1678               nread = Lstream_read (stream,
1679                                     random_buffer, sizeof (random_buffer));
1680               if (!nread)
1681                 break;
1682             } while(1);
1683         }
1684       *eol_type_in_out = decst.eol_type;
1685       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1686         {
1687           if (EQ(coding_system, Qnil))
1688             *codesys_in_out = coding_system_from_mask (decst.mask);
1689           else
1690             *codesys_in_out = coding_system;
1691         }
1692     }
1693   /* If we absolutely can't determine the EOL type, just assume LF. */
1694   if (*eol_type_in_out == EOL_AUTODETECT)
1695     *eol_type_in_out = EOL_LF;
1696
1697   Lstream_rewind (stream);
1698 }
1699
1700 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1701 Detect coding system of the text in the region between START and END.
1702 Returned a list of possible coding systems ordered by priority.
1703 If only ASCII characters are found, it returns 'undecided or one of
1704 its subsidiary coding systems according to a detected end-of-line
1705 type.  Optional arg BUFFER defaults to the current buffer.
1706 */
1707        (start, end, buffer))
1708 {
1709   Lisp_Object val = Qnil;
1710   struct buffer *buf = decode_buffer (buffer, 0);
1711   Bufpos b, e;
1712   Lisp_Object instream, lb_instream;
1713   Lstream *istr, *lb_istr;
1714   struct detection_state decst;
1715   struct gcpro gcpro1, gcpro2;
1716
1717   get_buffer_range_char (buf, start, end, &b, &e, 0);
1718   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1719   lb_istr = XLSTREAM (lb_instream);
1720   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1721   istr = XLSTREAM (instream);
1722   GCPRO2 (instream, lb_instream);
1723   xzero (decst);
1724   decst.eol_type = EOL_AUTODETECT;
1725   decst.mask = ~0;
1726   while (1)
1727     {
1728       unsigned char random_buffer[4096];
1729       int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1730
1731       if (!nread)
1732         break;
1733       if (detect_coding_type (&decst, random_buffer, nread, 0))
1734         break;
1735     }
1736
1737   if (decst.mask == ~0)
1738     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1739                                     decst.eol_type);
1740   else
1741     {
1742       int i;
1743
1744       val = Qnil;
1745 #ifdef MULE
1746       decst.mask = postprocess_iso2022_mask (decst.mask);
1747 #endif
1748       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1749         {
1750           int sys = coding_category_by_priority[i];
1751           if (decst.mask & (1 << sys))
1752             {
1753               Lisp_Object codesys = coding_category_system[sys];
1754               if (!NILP (codesys))
1755                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1756               val = Fcons (codesys, val);
1757             }
1758         }
1759     }
1760   Lstream_close (istr);
1761   UNGCPRO;
1762   Lstream_delete (istr);
1763   Lstream_delete (lb_istr);
1764   return val;
1765 }
1766
1767 \f
1768 /************************************************************************/
1769 /*           Converting to internal Mule format ("decoding")            */
1770 /************************************************************************/
1771
1772 /* A decoding stream is a stream used for decoding text (i.e.
1773    converting from some external format to internal format).
1774    The decoding-stream object keeps track of the actual coding
1775    stream, the stream that is at the other end, and data that
1776    needs to be persistent across the lifetime of the stream. */
1777
1778 /* Handle the EOL stuff related to just-read-in character C.
1779    EOL_TYPE is the EOL type of the coding stream.
1780    FLAGS is the current value of FLAGS in the coding stream, and may
1781    be modified by this macro.  (The macro only looks at the
1782    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1783    bytes are to be written.  You need to also define a local goto
1784    label "label_continue_loop" that is at the end of the main
1785    character-reading loop.
1786
1787    If C is a CR character, then this macro handles it entirely and
1788    jumps to label_continue_loop.  Otherwise, this macro does not add
1789    anything to DST, and continues normally.  You should continue
1790    processing C normally after this macro. */
1791
1792 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
1793 do {                                                            \
1794   if (c == '\r')                                                \
1795     {                                                           \
1796       if (eol_type == EOL_CR)                                   \
1797         Dynarr_add (dst, '\n');                                 \
1798       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1799         Dynarr_add (dst, c);                                    \
1800       else                                                      \
1801         flags |= CODING_STATE_CR;                               \
1802       goto label_continue_loop;                                 \
1803     }                                                           \
1804   else if (flags & CODING_STATE_CR)                             \
1805     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
1806       if (c != '\n')                                            \
1807         Dynarr_add (dst, '\r');                                 \
1808       flags &= ~CODING_STATE_CR;                                \
1809     }                                                           \
1810 } while (0)
1811
1812 /* C should be a binary character in the range 0 - 255; convert
1813    to internal format and add to Dynarr DST. */
1814
1815 #ifdef UTF2000
1816 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1817 do {                                            \
1818   if (BYTE_ASCII_P (c))                         \
1819     Dynarr_add (dst, c);                        \
1820   else                                          \
1821     {                                           \
1822       Dynarr_add (dst, (c >> 6) | 0xc0);        \
1823       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
1824     }                                           \
1825 } while (0)
1826
1827 INLINE void
1828 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1829 {
1830   if ( c <= 0x7f )
1831     {
1832       Dynarr_add (dst, c);
1833     }
1834   else if ( c <= 0x7ff )
1835     {
1836       Dynarr_add (dst, (c >> 6) | 0xc0);
1837       Dynarr_add (dst, (c & 0x3f) | 0x80);
1838     }
1839   else if ( c <= 0xffff )
1840     {
1841       Dynarr_add (dst,  (c >> 12) | 0xe0);
1842       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1843       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1844     }
1845   else if ( c <= 0x1fffff )
1846     {
1847       Dynarr_add (dst,  (c >> 18) | 0xf0);
1848       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1849       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1850       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1851     }
1852   else if ( c <= 0x3ffffff )
1853     {
1854       Dynarr_add (dst,  (c >> 24) | 0xf8);
1855       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1856       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1857       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1858       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1859     }
1860   else
1861     {
1862       Dynarr_add (dst,  (c >> 30) | 0xfc);
1863       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1864       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1865       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1866       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1867       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1868     }
1869 }
1870 #else
1871 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
1872 do {                                            \
1873   if (BYTE_ASCII_P (c))                         \
1874     Dynarr_add (dst, c);                        \
1875   else if (BYTE_C1_P (c))                       \
1876     {                                           \
1877       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1878       Dynarr_add (dst, c + 0x20);               \
1879     }                                           \
1880   else                                          \
1881     {                                           \
1882       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1883       Dynarr_add (dst, c);                      \
1884     }                                           \
1885 } while (0)
1886 #endif
1887
1888 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
1889 do {                                    \
1890   if (ch)                               \
1891     {                                   \
1892       DECODE_ADD_BINARY_CHAR (ch, dst); \
1893       ch = 0;                           \
1894     }                                   \
1895 } while (0)
1896
1897 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1898 do {                                    \
1899   if (flags & CODING_STATE_END)         \
1900     {                                   \
1901       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
1902       if (flags & CODING_STATE_CR)      \
1903         Dynarr_add (dst, '\r');         \
1904     }                                   \
1905 } while (0)
1906
1907 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1908
1909 struct decoding_stream
1910 {
1911   /* Coding system that governs the conversion. */
1912   Lisp_Coding_System *codesys;
1913
1914   /* Stream that we read the encoded data from or
1915      write the decoded data to. */
1916   Lstream *other_end;
1917
1918   /* If we are reading, then we can return only a fixed amount of
1919      data, so if the conversion resulted in too much data, we store it
1920      here for retrieval the next time around. */
1921   unsigned_char_dynarr *runoff;
1922
1923   /* FLAGS holds flags indicating the current state of the decoding.
1924      Some of these flags are dependent on the coding system. */
1925   unsigned int flags;
1926
1927   /* CH holds a partially built-up character.  Since we only deal
1928      with one- and two-byte characters at the moment, we only use
1929      this to store the first byte of a two-byte character. */
1930   unsigned int ch;
1931
1932   /* EOL_TYPE specifies the type of end-of-line conversion that
1933      currently applies.  We need to keep this separate from the
1934      EOL type stored in CODESYS because the latter might indicate
1935      automatic EOL-type detection while the former will always
1936      indicate a particular EOL type. */
1937   enum eol_type eol_type;
1938 #ifdef MULE
1939   /* Additional ISO2022 information.  We define the structure above
1940      because it's also needed by the detection routines. */
1941   struct iso2022_decoder iso2022;
1942
1943   /* Additional information (the state of the running CCL program)
1944      used by the CCL decoder. */
1945   struct ccl_program ccl;
1946
1947   /* counter for UTF-8 or UCS-4 */
1948   unsigned char counter;
1949 #endif
1950   struct detection_state decst;
1951 };
1952
1953 static int decoding_reader     (Lstream *stream,       unsigned char *data, size_t size);
1954 static int decoding_writer     (Lstream *stream, CONST unsigned char *data, size_t size);
1955 static int decoding_rewinder   (Lstream *stream);
1956 static int decoding_seekable_p (Lstream *stream);
1957 static int decoding_flusher    (Lstream *stream);
1958 static int decoding_closer     (Lstream *stream);
1959
1960 static Lisp_Object decoding_marker (Lisp_Object stream,
1961                                     void (*markobj) (Lisp_Object));
1962
1963 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1964                                sizeof (struct decoding_stream));
1965
1966 static Lisp_Object
1967 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1968 {
1969   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1970   Lisp_Object str_obj;
1971
1972   /* We do not need to mark the coding systems or charsets stored
1973      within the stream because they are stored in a global list
1974      and automatically marked. */
1975
1976   XSETLSTREAM (str_obj, str);
1977   markobj (str_obj);
1978   if (str->imp->marker)
1979     return (str->imp->marker) (str_obj, markobj);
1980   else
1981     return Qnil;
1982 }
1983
1984 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
1985    so we read data from the other end, decode it, and store it into DATA. */
1986
1987 static int
1988 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1989 {
1990   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1991   unsigned char *orig_data = data;
1992   int read_size;
1993   int error_occurred = 0;
1994
1995   /* We need to interface to mule_decode(), which expects to take some
1996      amount of data and store the result into a Dynarr.  We have
1997      mule_decode() store into str->runoff, and take data from there
1998      as necessary. */
1999
2000   /* We loop until we have enough data, reading chunks from the other
2001      end and decoding it. */
2002   while (1)
2003     {
2004       /* Take data from the runoff if we can.  Make sure to take at
2005          most SIZE bytes, and delete the data from the runoff. */
2006       if (Dynarr_length (str->runoff) > 0)
2007         {
2008           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2009           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2010           Dynarr_delete_many (str->runoff, 0, chunk);
2011           data += chunk;
2012           size -= chunk;
2013         }
2014
2015       if (size == 0)
2016         break; /* No more room for data */
2017
2018       if (str->flags & CODING_STATE_END)
2019         /* This means that on the previous iteration, we hit the EOF on
2020            the other end.  We loop once more so that mule_decode() can
2021            output any final stuff it may be holding, or any "go back
2022            to a sane state" escape sequences. (This latter makes sense
2023            during encoding.) */
2024         break;
2025
2026       /* Exhausted the runoff, so get some more.  DATA has at least
2027          SIZE bytes left of storage in it, so it's OK to read directly
2028          into it.  (We'll be overwriting above, after we've decoded it
2029          into the runoff.) */
2030       read_size = Lstream_read (str->other_end, data, size);
2031       if (read_size < 0)
2032         {
2033           error_occurred = 1;
2034           break;
2035         }
2036       if (read_size == 0)
2037         /* There might be some more end data produced in the translation.
2038            See the comment above. */
2039         str->flags |= CODING_STATE_END;
2040       mule_decode (stream, data, str->runoff, read_size);
2041     }
2042
2043   if (data - orig_data == 0)
2044     return error_occurred ? -1 : 0;
2045   else
2046     return data - orig_data;
2047 }
2048
2049 static int
2050 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2051 {
2052   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2053   int retval;
2054
2055   /* Decode all our data into the runoff, and then attempt to write
2056      it all out to the other end.  Remove whatever chunk we succeeded
2057      in writing. */
2058   mule_decode (stream, data, str->runoff, size);
2059   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2060                           Dynarr_length (str->runoff));
2061   if (retval > 0)
2062     Dynarr_delete_many (str->runoff, 0, retval);
2063   /* Do NOT return retval.  The return value indicates how much
2064      of the incoming data was written, not how many bytes were
2065      written. */
2066   return size;
2067 }
2068
2069 static void
2070 reset_decoding_stream (struct decoding_stream *str)
2071 {
2072 #ifdef MULE
2073   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2074     {
2075       Lisp_Object coding_system;
2076       XSETCODING_SYSTEM (coding_system, str->codesys);
2077       reset_iso2022 (coding_system, &str->iso2022);
2078     }
2079   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2080     {
2081       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2082     }
2083   str->counter = 0;
2084 #endif /* MULE */
2085   str->flags = str->ch = 0;
2086 }
2087
2088 static int
2089 decoding_rewinder (Lstream *stream)
2090 {
2091   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2092   reset_decoding_stream (str);
2093   Dynarr_reset (str->runoff);
2094   return Lstream_rewind (str->other_end);
2095 }
2096
2097 static int
2098 decoding_seekable_p (Lstream *stream)
2099 {
2100   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2101   return Lstream_seekable_p (str->other_end);
2102 }
2103
2104 static int
2105 decoding_flusher (Lstream *stream)
2106 {
2107   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2108   return Lstream_flush (str->other_end);
2109 }
2110
2111 static int
2112 decoding_closer (Lstream *stream)
2113 {
2114   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2115   if (stream->flags & LSTREAM_FL_WRITE)
2116     {
2117       str->flags |= CODING_STATE_END;
2118       decoding_writer (stream, 0, 0);
2119     }
2120   Dynarr_free (str->runoff);
2121 #ifdef MULE
2122 #ifdef ENABLE_COMPOSITE_CHARS
2123   if (str->iso2022.composite_chars)
2124     Dynarr_free (str->iso2022.composite_chars);
2125 #endif
2126 #endif
2127   return Lstream_close (str->other_end);
2128 }
2129
2130 Lisp_Object
2131 decoding_stream_coding_system (Lstream *stream)
2132 {
2133   Lisp_Object coding_system;
2134   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2135
2136   XSETCODING_SYSTEM (coding_system, str->codesys);
2137   return subsidiary_coding_system (coding_system, str->eol_type);
2138 }
2139
2140 void
2141 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2142 {
2143   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2144   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2145   str->codesys = cs;
2146   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2147     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2148   reset_decoding_stream (str);
2149 }
2150
2151 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2152    stream for writing, no automatic code detection will be performed.
2153    The reason for this is that automatic code detection requires a
2154    seekable input.  Things will also fail if you open a decoding
2155    stream for reading using a non-fully-specified coding system and
2156    a non-seekable input stream. */
2157
2158 static Lisp_Object
2159 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2160                         CONST char *mode)
2161 {
2162   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2163   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2164   Lisp_Object obj;
2165
2166   xzero (*str);
2167   str->other_end = stream;
2168   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2169   str->eol_type = EOL_AUTODETECT;
2170   if (!strcmp (mode, "r")
2171       && Lstream_seekable_p (stream))
2172     /* We can determine the coding system now. */
2173     determine_real_coding_system (stream, &codesys, &str->eol_type);
2174   set_decoding_stream_coding_system (lstr, codesys);
2175   str->decst.eol_type = str->eol_type;
2176   str->decst.mask = ~0;
2177   XSETLSTREAM (obj, lstr);
2178   return obj;
2179 }
2180
2181 Lisp_Object
2182 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2183 {
2184   return make_decoding_stream_1 (stream, codesys, "r");
2185 }
2186
2187 Lisp_Object
2188 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2189 {
2190   return make_decoding_stream_1 (stream, codesys, "w");
2191 }
2192
2193 /* Note: the decode_coding_* functions all take the same
2194    arguments as mule_decode(), which is to say some SRC data of
2195    size N, which is to be stored into dynamic array DST.
2196    DECODING is the stream within which the decoding is
2197    taking place, but no data is actually read from or
2198    written to that stream; that is handled in decoding_reader()
2199    or decoding_writer().  This allows the same functions to
2200    be used for both reading and writing. */
2201
2202 static void
2203 mule_decode (Lstream *decoding, CONST unsigned char *src,
2204              unsigned_char_dynarr *dst, unsigned int n)
2205 {
2206   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2207
2208   /* If necessary, do encoding-detection now.  We do this when
2209      we're a writing stream or a non-seekable reading stream,
2210      meaning that we can't just process the whole input,
2211      rewind, and start over. */
2212
2213   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2214       str->eol_type == EOL_AUTODETECT)
2215     {
2216       Lisp_Object codesys;
2217
2218       XSETCODING_SYSTEM (codesys, str->codesys);
2219       detect_coding_type (&str->decst, src, n,
2220                           CODING_SYSTEM_TYPE (str->codesys) !=
2221                           CODESYS_AUTODETECT);
2222       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2223           str->decst.mask != ~0)
2224         /* #### This is cheesy.  What we really ought to do is
2225            buffer up a certain amount of data so as to get a
2226            less random result. */
2227         codesys = coding_system_from_mask (str->decst.mask);
2228       str->eol_type = str->decst.eol_type;
2229       if (XCODING_SYSTEM (codesys) != str->codesys)
2230         {
2231           /* Preserve the CODING_STATE_END flag in case it was set.
2232              If we erase it, bad things might happen. */
2233           int was_end = str->flags & CODING_STATE_END;
2234           set_decoding_stream_coding_system (decoding, codesys);
2235           if (was_end)
2236             str->flags |= CODING_STATE_END;
2237         }
2238     }
2239
2240   switch (CODING_SYSTEM_TYPE (str->codesys))
2241     {
2242 #ifdef DEBUG_XEMACS
2243     case CODESYS_INTERNAL:
2244       Dynarr_add_many (dst, src, n);
2245       break;
2246 #endif
2247     case CODESYS_AUTODETECT:
2248       /* If we got this far and still haven't decided on the coding
2249          system, then do no conversion. */
2250     case CODESYS_NO_CONVERSION:
2251       decode_coding_no_conversion (decoding, src, dst, n);
2252       break;
2253 #ifdef MULE
2254     case CODESYS_SHIFT_JIS:
2255       decode_coding_sjis (decoding, src, dst, n);
2256       break;
2257     case CODESYS_BIG5:
2258       decode_coding_big5 (decoding, src, dst, n);
2259       break;
2260     case CODESYS_UCS4:
2261       decode_coding_ucs4 (decoding, src, dst, n);
2262       break;
2263     case CODESYS_UTF8:
2264       decode_coding_utf8 (decoding, src, dst, n);
2265       break;
2266     case CODESYS_CCL:
2267       str->ccl.last_block = str->flags & CODING_STATE_END;
2268       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2269       break;
2270     case CODESYS_ISO2022:
2271       decode_coding_iso2022 (decoding, src, dst, n);
2272       break;
2273 #endif /* MULE */
2274     default:
2275       abort ();
2276     }
2277 }
2278
2279 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2280 Decode the text between START and END which is encoded in CODING-SYSTEM.
2281 This is useful if you've read in encoded text from a file without decoding
2282 it (e.g. you read in a JIS-formatted file but used the `binary' or
2283 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2284 Return length of decoded text.
2285 BUFFER defaults to the current buffer if unspecified.
2286 */
2287        (start, end, coding_system, buffer))
2288 {
2289   Bufpos b, e;
2290   struct buffer *buf = decode_buffer (buffer, 0);
2291   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2292   Lstream *istr, *ostr;
2293   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2294
2295   get_buffer_range_char (buf, start, end, &b, &e, 0);
2296
2297   barf_if_buffer_read_only (buf, b, e);
2298
2299   coding_system = Fget_coding_system (coding_system);
2300   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2301   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2302   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2303                                               coding_system);
2304   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2305                                            Fget_coding_system (Qbinary));
2306   istr = XLSTREAM (instream);
2307   ostr = XLSTREAM (outstream);
2308   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2309
2310   /* The chain of streams looks like this:
2311
2312      [BUFFER] <----- send through
2313                      ------> [ENCODE AS BINARY]
2314                              ------> [DECODE AS SPECIFIED]
2315                                      ------> [BUFFER]
2316    */
2317
2318   while (1)
2319     {
2320       char tempbuf[1024]; /* some random amount */
2321       Bufpos newpos, even_newer_pos;
2322       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2323       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2324
2325       if (!size_in_bytes)
2326         break;
2327       newpos = lisp_buffer_stream_startpos (istr);
2328       Lstream_write (ostr, tempbuf, size_in_bytes);
2329       even_newer_pos = lisp_buffer_stream_startpos (istr);
2330       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2331                            even_newer_pos, 0);
2332     }
2333   Lstream_close (istr);
2334   Lstream_close (ostr);
2335   UNGCPRO;
2336   Lstream_delete (istr);
2337   Lstream_delete (ostr);
2338   Lstream_delete (XLSTREAM (de_outstream));
2339   Lstream_delete (XLSTREAM (lb_outstream));
2340   return Qnil;
2341 }
2342
2343 \f
2344 /************************************************************************/
2345 /*           Converting to an external encoding ("encoding")            */
2346 /************************************************************************/
2347
2348 /* An encoding stream is an output stream.  When you create the
2349    stream, you specify the coding system that governs the encoding
2350    and another stream that the resulting encoded data is to be
2351    sent to, and then start sending data to it. */
2352
2353 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2354
2355 struct encoding_stream
2356 {
2357   /* Coding system that governs the conversion. */
2358   Lisp_Coding_System *codesys;
2359
2360   /* Stream that we read the encoded data from or
2361      write the decoded data to. */
2362   Lstream *other_end;
2363
2364   /* If we are reading, then we can return only a fixed amount of
2365      data, so if the conversion resulted in too much data, we store it
2366      here for retrieval the next time around. */
2367   unsigned_char_dynarr *runoff;
2368
2369   /* FLAGS holds flags indicating the current state of the encoding.
2370      Some of these flags are dependent on the coding system. */
2371   unsigned int flags;
2372
2373   /* CH holds a partially built-up character.  Since we only deal
2374      with one- and two-byte characters at the moment, we only use
2375      this to store the first byte of a two-byte character. */
2376   unsigned int ch;
2377 #ifdef MULE
2378   /* Additional information used by the ISO2022 encoder. */
2379   struct
2380     {
2381       /* CHARSET holds the character sets currently assigned to the G0
2382          through G3 registers.  It is initialized from the array
2383          INITIAL_CHARSET in CODESYS. */
2384       Lisp_Object charset[4];
2385
2386       /* Which registers are currently invoked into the left (GL) and
2387          right (GR) halves of the 8-bit encoding space? */
2388       int register_left, register_right;
2389
2390       /* Whether we need to explicitly designate the charset in the
2391          G? register before using it.  It is initialized from the
2392          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2393       unsigned char force_charset_on_output[4];
2394
2395       /* Other state variables that need to be preserved across
2396          invocations. */
2397       Lisp_Object current_charset;
2398       int current_half;
2399       int current_char_boundary;
2400     } iso2022;
2401
2402   void (*encode_char) (struct encoding_stream *str, Emchar c,
2403                        unsigned_char_dynarr *dst, unsigned int *flags);
2404   void (*finish) (struct encoding_stream *str,
2405                   unsigned_char_dynarr *dst, unsigned int *flags);
2406
2407   /* Additional information (the state of the running CCL program)
2408      used by the CCL encoder. */
2409   struct ccl_program ccl;
2410 #endif /* MULE */
2411 };
2412
2413 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2414 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2415                             size_t size);
2416 static int encoding_rewinder   (Lstream *stream);
2417 static int encoding_seekable_p (Lstream *stream);
2418 static int encoding_flusher    (Lstream *stream);
2419 static int encoding_closer     (Lstream *stream);
2420
2421 static Lisp_Object encoding_marker (Lisp_Object stream,
2422                                     void (*markobj) (Lisp_Object));
2423
2424 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2425                                sizeof (struct encoding_stream));
2426
2427 static Lisp_Object
2428 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2429 {
2430   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2431   Lisp_Object str_obj;
2432
2433   /* We do not need to mark the coding systems or charsets stored
2434      within the stream because they are stored in a global list
2435      and automatically marked. */
2436
2437   XSETLSTREAM (str_obj, str);
2438   markobj (str_obj);
2439   if (str->imp->marker)
2440     return (str->imp->marker) (str_obj, markobj);
2441   else
2442     return Qnil;
2443 }
2444
2445 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2446    so we read data from the other end, encode it, and store it into DATA. */
2447
2448 static int
2449 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2450 {
2451   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2452   unsigned char *orig_data = data;
2453   int read_size;
2454   int error_occurred = 0;
2455
2456   /* We need to interface to mule_encode(), which expects to take some
2457      amount of data and store the result into a Dynarr.  We have
2458      mule_encode() store into str->runoff, and take data from there
2459      as necessary. */
2460
2461   /* We loop until we have enough data, reading chunks from the other
2462      end and encoding it. */
2463   while (1)
2464     {
2465       /* Take data from the runoff if we can.  Make sure to take at
2466          most SIZE bytes, and delete the data from the runoff. */
2467       if (Dynarr_length (str->runoff) > 0)
2468         {
2469           int chunk = min ((int) size, Dynarr_length (str->runoff));
2470           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2471           Dynarr_delete_many (str->runoff, 0, chunk);
2472           data += chunk;
2473           size -= chunk;
2474         }
2475
2476       if (size == 0)
2477         break; /* No more room for data */
2478
2479       if (str->flags & CODING_STATE_END)
2480         /* This means that on the previous iteration, we hit the EOF on
2481            the other end.  We loop once more so that mule_encode() can
2482            output any final stuff it may be holding, or any "go back
2483            to a sane state" escape sequences. (This latter makes sense
2484            during encoding.) */
2485         break;
2486
2487       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2488          left of storage in it, so it's OK to read directly into it.
2489          (We'll be overwriting above, after we've encoded it into the
2490          runoff.) */
2491       read_size = Lstream_read (str->other_end, data, size);
2492       if (read_size < 0)
2493         {
2494           error_occurred = 1;
2495           break;
2496         }
2497       if (read_size == 0)
2498         /* There might be some more end data produced in the translation.
2499            See the comment above. */
2500         str->flags |= CODING_STATE_END;
2501       mule_encode (stream, data, str->runoff, read_size);
2502     }
2503
2504   if (data == orig_data)
2505     return error_occurred ? -1 : 0;
2506   else
2507     return data - orig_data;
2508 }
2509
2510 static int
2511 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2512 {
2513   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2514   int retval;
2515
2516   /* Encode all our data into the runoff, and then attempt to write
2517      it all out to the other end.  Remove whatever chunk we succeeded
2518      in writing. */
2519   mule_encode (stream, data, str->runoff, size);
2520   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2521                           Dynarr_length (str->runoff));
2522   if (retval > 0)
2523     Dynarr_delete_many (str->runoff, 0, retval);
2524   /* Do NOT return retval.  The return value indicates how much
2525      of the incoming data was written, not how many bytes were
2526      written. */
2527   return size;
2528 }
2529
2530 static void
2531 reset_encoding_stream (struct encoding_stream *str)
2532 {
2533 #ifdef MULE
2534   switch (CODING_SYSTEM_TYPE (str->codesys))
2535     {
2536     case CODESYS_ISO2022:
2537       {
2538         int i;
2539
2540         str->encode_char = &char_encode_iso2022;
2541         str->finish = &char_finish_iso2022;
2542         for (i = 0; i < 4; i++)
2543           {
2544             str->iso2022.charset[i] =
2545               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2546             str->iso2022.force_charset_on_output[i] =
2547               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2548           }
2549         str->iso2022.register_left = 0;
2550         str->iso2022.register_right = 1;
2551         str->iso2022.current_charset = Qnil;
2552         str->iso2022.current_half = 0;
2553         break;
2554       }
2555     case CODESYS_CCL:
2556       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2557       break;
2558     case CODESYS_UTF8:
2559       str->encode_char = &char_encode_utf8;
2560       str->finish = &char_finish_utf8;
2561       break;
2562     case CODESYS_UCS4:
2563       str->encode_char = &char_encode_ucs4;
2564       str->finish = &char_finish_ucs4;
2565       break;
2566     case CODESYS_SHIFT_JIS:
2567       str->encode_char = &char_encode_shift_jis;
2568       str->finish = &char_finish_shift_jis;
2569       break;
2570     default:
2571       break;
2572     }
2573 #endif /* MULE */
2574   str->iso2022.current_char_boundary = 0;
2575   str->flags = str->ch = 0;
2576 }
2577
2578 static int
2579 encoding_rewinder (Lstream *stream)
2580 {
2581   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2582   reset_encoding_stream (str);
2583   Dynarr_reset (str->runoff);
2584   return Lstream_rewind (str->other_end);
2585 }
2586
2587 static int
2588 encoding_seekable_p (Lstream *stream)
2589 {
2590   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2591   return Lstream_seekable_p (str->other_end);
2592 }
2593
2594 static int
2595 encoding_flusher (Lstream *stream)
2596 {
2597   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2598   return Lstream_flush (str->other_end);
2599 }
2600
2601 static int
2602 encoding_closer (Lstream *stream)
2603 {
2604   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2605   if (stream->flags & LSTREAM_FL_WRITE)
2606     {
2607       str->flags |= CODING_STATE_END;
2608       encoding_writer (stream, 0, 0);
2609     }
2610   Dynarr_free (str->runoff);
2611   return Lstream_close (str->other_end);
2612 }
2613
2614 Lisp_Object
2615 encoding_stream_coding_system (Lstream *stream)
2616 {
2617   Lisp_Object coding_system;
2618   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2619
2620   XSETCODING_SYSTEM (coding_system, str->codesys);
2621   return coding_system;
2622 }
2623
2624 void
2625 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2626 {
2627   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2628   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2629   str->codesys = cs;
2630   reset_encoding_stream (str);
2631 }
2632
2633 static Lisp_Object
2634 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2635                         CONST char *mode)
2636 {
2637   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2638   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2639   Lisp_Object obj;
2640
2641   xzero (*str);
2642   str->runoff = Dynarr_new (unsigned_char);
2643   str->other_end = stream;
2644   set_encoding_stream_coding_system (lstr, codesys);
2645   XSETLSTREAM (obj, lstr);
2646   return obj;
2647 }
2648
2649 Lisp_Object
2650 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2651 {
2652   return make_encoding_stream_1 (stream, codesys, "r");
2653 }
2654
2655 Lisp_Object
2656 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2657 {
2658   return make_encoding_stream_1 (stream, codesys, "w");
2659 }
2660
2661 /* Convert N bytes of internally-formatted data stored in SRC to an
2662    external format, according to the encoding stream ENCODING.
2663    Store the encoded data into DST. */
2664
2665 static void
2666 mule_encode (Lstream *encoding, CONST unsigned char *src,
2667              unsigned_char_dynarr *dst, unsigned int n)
2668 {
2669   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2670
2671   switch (CODING_SYSTEM_TYPE (str->codesys))
2672     {
2673 #ifdef DEBUG_XEMACS
2674     case CODESYS_INTERNAL:
2675       Dynarr_add_many (dst, src, n);
2676       break;
2677 #endif
2678     case CODESYS_AUTODETECT:
2679       /* If we got this far and still haven't decided on the coding
2680          system, then do no conversion. */
2681     case CODESYS_NO_CONVERSION:
2682       encode_coding_no_conversion (encoding, src, dst, n);
2683       break;
2684 #ifdef MULE
2685     case CODESYS_BIG5:
2686       encode_coding_big5 (encoding, src, dst, n);
2687       break;
2688     case CODESYS_CCL:
2689       str->ccl.last_block = str->flags & CODING_STATE_END;
2690       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2691       break;
2692 #endif /* MULE */
2693     default:
2694       text_encode_generic (encoding, src, dst, n);
2695     }
2696 }
2697
2698 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2699 Encode the text between START and END using CODING-SYSTEM.
2700 This will, for example, convert Japanese characters into stuff such as
2701 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2702 text.  BUFFER defaults to the current buffer if unspecified.
2703 */
2704        (start, end, coding_system, buffer))
2705 {
2706   Bufpos b, e;
2707   struct buffer *buf = decode_buffer (buffer, 0);
2708   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2709   Lstream *istr, *ostr;
2710   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2711
2712   get_buffer_range_char (buf, start, end, &b, &e, 0);
2713
2714   barf_if_buffer_read_only (buf, b, e);
2715
2716   coding_system = Fget_coding_system (coding_system);
2717   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2718   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2719   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2720                                               Fget_coding_system (Qbinary));
2721   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2722                                            coding_system);
2723   istr = XLSTREAM (instream);
2724   ostr = XLSTREAM (outstream);
2725   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2726   /* The chain of streams looks like this:
2727
2728      [BUFFER] <----- send through
2729                      ------> [ENCODE AS SPECIFIED]
2730                              ------> [DECODE AS BINARY]
2731                                      ------> [BUFFER]
2732    */
2733   while (1)
2734     {
2735       char tempbuf[1024]; /* some random amount */
2736       Bufpos newpos, even_newer_pos;
2737       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2738       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2739
2740       if (!size_in_bytes)
2741         break;
2742       newpos = lisp_buffer_stream_startpos (istr);
2743       Lstream_write (ostr, tempbuf, size_in_bytes);
2744       even_newer_pos = lisp_buffer_stream_startpos (istr);
2745       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2746                            even_newer_pos, 0);
2747     }
2748
2749   {
2750     Charcount retlen =
2751       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2752     Lstream_close (istr);
2753     Lstream_close (ostr);
2754     UNGCPRO;
2755     Lstream_delete (istr);
2756     Lstream_delete (ostr);
2757     Lstream_delete (XLSTREAM (de_outstream));
2758     Lstream_delete (XLSTREAM (lb_outstream));
2759     return make_int (retlen);
2760   }
2761 }
2762
2763 #ifdef MULE
2764 \f
2765 static void
2766 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
2767                      unsigned_char_dynarr *dst, unsigned int n)
2768 {
2769   unsigned char c;
2770   unsigned char char_boundary;
2771   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2772   unsigned int flags          = str->flags;
2773   Emchar ch                   = str->ch;
2774
2775   char_boundary = str->iso2022.current_char_boundary;
2776
2777   while (n--)
2778     {
2779       c = *src++;
2780
2781       if (char_boundary == 0)
2782         {
2783           if (c >= 0xfc)
2784             {
2785               ch = c & 0x01;
2786               char_boundary = 5;
2787             }
2788           else if (c >= 0xf8)
2789             {
2790               ch = c & 0x03;
2791               char_boundary = 4;
2792             }
2793           else if (c >= 0xf0)
2794             {
2795               ch = c & 0x07;
2796               char_boundary = 3;
2797             }
2798           else if (c >= 0xe0)
2799             {
2800               ch = c & 0x0f;
2801               char_boundary = 2;
2802             }
2803           else if (c >= 0xc0)
2804             {
2805               ch = c & 0x1f;
2806               char_boundary = 1;
2807             }
2808           else
2809             (*str->encode_char) (str, c, dst, &flags);
2810         }
2811       else if (char_boundary == 1)
2812         {
2813           (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
2814           ch =0;
2815           char_boundary = 0;
2816         }
2817       else
2818         {
2819           ch = (ch << 6) | (c & 0x3f);
2820           char_boundary--;
2821         }
2822     }
2823
2824   if ((char_boundary == 0) && (flags & CODING_STATE_END))
2825     {
2826       (*str->finish) (str, dst, &flags);
2827     }
2828
2829   str->flags = flags;
2830   str->ch    = ch;
2831   str->iso2022.current_char_boundary = char_boundary;
2832 }
2833
2834 \f
2835 /************************************************************************/
2836 /*                          Shift-JIS methods                           */
2837 /************************************************************************/
2838
2839 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2840    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2841    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2842    encoded by "position-code + 0x80".  A character of JISX0208
2843    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2844    position-codes are divided and shifted so that it fit in the range
2845    below.
2846
2847    --- CODE RANGE of Shift-JIS ---
2848    (character set)      (range)
2849    ASCII                0x00 .. 0x7F
2850    JISX0201-Kana        0xA0 .. 0xDF
2851    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2852             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2853    -------------------------------
2854
2855 */
2856
2857 /* Is this the first byte of a Shift-JIS two-byte char? */
2858
2859 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2860   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2861
2862 /* Is this the second byte of a Shift-JIS two-byte char? */
2863
2864 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2865   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2866
2867 #define BYTE_SJIS_KATAKANA_P(c) \
2868   ((c) >= 0xA1 && (c) <= 0xDF)
2869
2870 static int
2871 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2872                     unsigned int n)
2873 {
2874   int c;
2875
2876   while (n--)
2877     {
2878       c = *src++;
2879       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2880         return 0;
2881       if (st->shift_jis.in_second_byte)
2882         {
2883           st->shift_jis.in_second_byte = 0;
2884           if (c < 0x40)
2885             return 0;
2886         }
2887       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2888         st->shift_jis.in_second_byte = 1;
2889     }
2890   return CODING_CATEGORY_SHIFT_JIS_MASK;
2891 }
2892
2893 /* Convert Shift-JIS data to internal format. */
2894
2895 static void
2896 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2897                     unsigned_char_dynarr *dst, unsigned int n)
2898 {
2899   unsigned char c;
2900   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2901   unsigned int flags  = str->flags;
2902   unsigned int ch     = str->ch;
2903   eol_type_t eol_type = str->eol_type;
2904
2905   while (n--)
2906     {
2907       c = *src++;
2908
2909       if (ch)
2910         {
2911           /* Previous character was first byte of Shift-JIS Kanji char. */
2912           if (BYTE_SJIS_TWO_BYTE_2_P (c))
2913             {
2914               unsigned char e1, e2;
2915
2916               DECODE_SJIS (ch, c, e1, e2);
2917 #ifdef UTF2000
2918               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
2919                                             e1 & 0x7F,
2920                                             e2 & 0x7F), dst);
2921 #else
2922               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2923               Dynarr_add (dst, e1);
2924               Dynarr_add (dst, e2);
2925 #endif
2926             }
2927           else
2928             {
2929               DECODE_ADD_BINARY_CHAR (ch, dst);
2930               DECODE_ADD_BINARY_CHAR (c, dst);
2931             }
2932           ch = 0;
2933         }
2934       else
2935         {
2936           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2937           if (BYTE_SJIS_TWO_BYTE_1_P (c))
2938             ch = c;
2939           else if (BYTE_SJIS_KATAKANA_P (c))
2940             {
2941 #ifdef UTF2000
2942               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
2943                                             c & 0x7F, 0), dst);
2944 #else
2945               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2946               Dynarr_add (dst, c);
2947 #endif
2948             }
2949 #ifdef UTF2000
2950           else if (c > 32)
2951             DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
2952                                           c, 0), dst);
2953 #endif
2954           else
2955             DECODE_ADD_BINARY_CHAR (c, dst);
2956         }
2957     label_continue_loop:;
2958     }
2959
2960   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2961
2962   str->flags = flags;
2963   str->ch    = ch;
2964 }
2965
2966 /* Convert internal character representation to Shift_JIS. */
2967
2968 void
2969 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
2970                        unsigned_char_dynarr *dst, unsigned int *flags)
2971 {
2972   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2973
2974   if (ch == '\n')
2975     {
2976       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2977         Dynarr_add (dst, '\r');
2978       if (eol_type != EOL_CR)
2979         Dynarr_add (dst, ch);
2980     }
2981   else
2982     {
2983       Lisp_Object charset;
2984       unsigned int c1, c2, s1, s2;
2985 #ifdef UTF2000
2986       Lisp_Object value = charset_code_point (Vcharset_latin_jisx0201, ch);
2987       Lisp_Object ret = Fcar (value);
2988
2989       if (INTP (ret))
2990         {
2991           charset = Vcharset_latin_jisx0201;
2992           c1 = XINT (ret);
2993           c2 = 0;
2994         }
2995       else
2996 #endif
2997         BREAKUP_CHAR (ch, charset, c1, c2);
2998
2999       if (EQ(charset, Vcharset_katakana_jisx0201))
3000         {
3001           Dynarr_add (dst, c1 | 0x80);
3002         }
3003       else if (c2 == 0)
3004         {
3005           Dynarr_add (dst, c1);
3006         }
3007       else if (EQ(charset, Vcharset_japanese_jisx0208))
3008         {
3009           ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3010           Dynarr_add (dst, s1);
3011           Dynarr_add (dst, s2);
3012         }
3013       else
3014         Dynarr_add (dst, '?');
3015     }
3016 }
3017
3018 void
3019 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3020                        unsigned int *flags)
3021 {
3022 }
3023
3024 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3025 Decode a JISX0208 character of Shift-JIS coding-system.
3026 CODE is the character code in Shift-JIS as a cons of type bytes.
3027 Return the corresponding character.
3028 */
3029        (code))
3030 {
3031   unsigned char c1, c2, s1, s2;
3032
3033   CHECK_CONS (code);
3034   CHECK_INT (XCAR (code));
3035   CHECK_INT (XCDR (code));
3036   s1 = XINT (XCAR (code));
3037   s2 = XINT (XCDR (code));
3038   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3039       BYTE_SJIS_TWO_BYTE_2_P (s2))
3040     {
3041       DECODE_SJIS (s1, s2, c1, c2);
3042       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3043                                    c1 & 0x7F, c2 & 0x7F));
3044     }
3045   else
3046     return Qnil;
3047 }
3048
3049 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3050 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3051 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3052 */
3053        (ch))
3054 {
3055   Lisp_Object charset;
3056   int c1, c2, s1, s2;
3057
3058   CHECK_CHAR_COERCE_INT (ch);
3059   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3060   if (EQ (charset, Vcharset_japanese_jisx0208))
3061     {
3062       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3063       return Fcons (make_int (s1), make_int (s2));
3064     }
3065   else
3066     return Qnil;
3067 }
3068
3069 \f
3070 /************************************************************************/
3071 /*                            Big5 methods                              */
3072 /************************************************************************/
3073
3074 /* BIG5 is a coding system encoding two character sets: ASCII and
3075    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3076    character set and is encoded in two-byte.
3077
3078    --- CODE RANGE of BIG5 ---
3079    (character set)      (range)
3080    ASCII                0x00 .. 0x7F
3081    Big5 (1st byte)      0xA1 .. 0xFE
3082         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3083    --------------------------
3084
3085    Since the number of characters in Big5 is larger than maximum
3086    characters in Emacs' charset (96x96), it can't be handled as one
3087    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3088    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3089    contains frequently used characters and the latter contains less
3090    frequently used characters.  */
3091
3092 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3093   ((c) >= 0xA1 && (c) <= 0xFE)
3094
3095 /* Is this the second byte of a Shift-JIS two-byte char? */
3096
3097 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3098   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3099
3100 /* Number of Big5 characters which have the same code in 1st byte.  */
3101
3102 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3103
3104 /* Code conversion macros.  These are macros because they are used in
3105    inner loops during code conversion.
3106
3107    Note that temporary variables in macros introduce the classic
3108    dynamic-scoping problems with variable names.  We use capital-
3109    lettered variables in the assumption that XEmacs does not use
3110    capital letters in variables except in a very formalized way
3111    (e.g. Qstring). */
3112
3113 /* Convert Big5 code (b1, b2) into its internal string representation
3114    (lb, c1, c2). */
3115
3116 /* There is a much simpler way to split the Big5 charset into two.
3117    For the moment I'm going to leave the algorithm as-is because it
3118    claims to separate out the most-used characters into a single
3119    charset, which perhaps will lead to optimizations in various
3120    places.
3121
3122    The way the algorithm works is something like this:
3123
3124    Big5 can be viewed as a 94x157 charset, where the row is
3125    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3126    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3127    the split between low and high column numbers is apparently
3128    meaningless; ascending rows produce less and less frequent chars.
3129    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3130    the first charset, and the upper half (0xC9 .. 0xFE) to the
3131    second.  To do the conversion, we convert the character into
3132    a single number where 0 .. 156 is the first row, 157 .. 313
3133    is the second, etc.  That way, the characters are ordered by
3134    decreasing frequency.  Then we just chop the space in two
3135    and coerce the result into a 94x94 space.
3136    */
3137
3138 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3139 {                                                                       \
3140   int B1 = b1, B2 = b2;                                                 \
3141   unsigned int I                                                        \
3142     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3143                                                                         \
3144   if (B1 < 0xC9)                                                        \
3145     {                                                                   \
3146       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3147     }                                                                   \
3148   else                                                                  \
3149     {                                                                   \
3150       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3151       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3152     }                                                                   \
3153   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3154   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3155 } while (0)
3156
3157 /* Convert the internal string representation of a Big5 character
3158    (lb, c1, c2) into Big5 code (b1, b2). */
3159
3160 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3161 {                                                                       \
3162   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3163                                                                         \
3164   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3165     {                                                                   \
3166       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3167     }                                                                   \
3168   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3169   b2 = I % BIG5_SAME_ROW;                                               \
3170   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3171 } while (0)
3172
3173 static int
3174 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3175                     unsigned int n)
3176 {
3177   int c;
3178
3179   while (n--)
3180     {
3181       c = *src++;
3182       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3183           (c >= 0x80 && c <= 0xA0))
3184         return 0;
3185       if (st->big5.in_second_byte)
3186         {
3187           st->big5.in_second_byte = 0;
3188           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3189             return 0;
3190         }
3191       else if (c >= 0xA1)
3192         st->big5.in_second_byte = 1;
3193     }
3194   return CODING_CATEGORY_BIG5_MASK;
3195 }
3196
3197 /* Convert Big5 data to internal format. */
3198
3199 static void
3200 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3201                     unsigned_char_dynarr *dst, unsigned int n)
3202 {
3203   unsigned char c;
3204   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3205   unsigned int flags  = str->flags;
3206   unsigned int ch     = str->ch;
3207   eol_type_t eol_type = str->eol_type;
3208
3209   while (n--)
3210     {
3211       c = *src++;
3212       if (ch)
3213         {
3214           /* Previous character was first byte of Big5 char. */
3215           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3216             {
3217               unsigned char b1, b2, b3;
3218               DECODE_BIG5 (ch, c, b1, b2, b3);
3219               Dynarr_add (dst, b1);
3220               Dynarr_add (dst, b2);
3221               Dynarr_add (dst, b3);
3222             }
3223           else
3224             {
3225               DECODE_ADD_BINARY_CHAR (ch, dst);
3226               DECODE_ADD_BINARY_CHAR (c, dst);
3227             }
3228           ch = 0;
3229         }
3230       else
3231         {
3232           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3233           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3234             ch = c;
3235           else
3236             DECODE_ADD_BINARY_CHAR (c, dst);
3237         }
3238     label_continue_loop:;
3239     }
3240
3241   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3242
3243   str->flags = flags;
3244   str->ch    = ch;
3245 }
3246
3247 /* Convert internally-formatted data to Big5. */
3248
3249 static void
3250 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3251                     unsigned_char_dynarr *dst, unsigned int n)
3252 {
3253 #ifndef UTF2000
3254   unsigned char c;
3255   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3256   unsigned int flags  = str->flags;
3257   unsigned int ch     = str->ch;
3258   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3259
3260   while (n--)
3261     {
3262       c = *src++;
3263       if (c == '\n')
3264         {
3265           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3266             Dynarr_add (dst, '\r');
3267           if (eol_type != EOL_CR)
3268             Dynarr_add (dst, '\n');
3269         }
3270       else if (BYTE_ASCII_P (c))
3271         {
3272           /* ASCII. */
3273           Dynarr_add (dst, c);
3274         }
3275       else if (BUFBYTE_LEADING_BYTE_P (c))
3276         {
3277           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3278               c == LEADING_BYTE_CHINESE_BIG5_2)
3279             {
3280               /* A recognized leading byte. */
3281               ch = c;
3282               continue; /* not done with this character. */
3283             }
3284           /* otherwise just ignore this character. */
3285         }
3286       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3287                ch == LEADING_BYTE_CHINESE_BIG5_2)
3288         {
3289           /* Previous char was a recognized leading byte. */
3290           ch = (ch << 8) | c;
3291           continue; /* not done with this character. */
3292         }
3293       else if (ch)
3294         {
3295           /* Encountering second byte of a Big5 character. */
3296           unsigned char b1, b2;
3297
3298           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3299           Dynarr_add (dst, b1);
3300           Dynarr_add (dst, b2);
3301         }
3302
3303       ch = 0;
3304     }
3305
3306   str->flags = flags;
3307   str->ch    = ch;
3308 #endif
3309 }
3310
3311
3312 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3313 Decode a Big5 character CODE of BIG5 coding-system.
3314 CODE is the character code in BIG5, a cons of two integers.
3315 Return the corresponding character.
3316 */
3317        (code))
3318 {
3319   unsigned char c1, c2, b1, b2;
3320
3321   CHECK_CONS (code);
3322   CHECK_INT (XCAR (code));
3323   CHECK_INT (XCDR (code));
3324   b1 = XINT (XCAR (code));
3325   b2 = XINT (XCDR (code));
3326   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3327       BYTE_BIG5_TWO_BYTE_2_P (b2))
3328     {
3329       Charset_ID leading_byte;
3330       Lisp_Object charset;
3331       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3332       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3333       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3334     }
3335   else
3336     return Qnil;
3337 }
3338
3339 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3340 Encode the Big5 character CH to BIG5 coding-system.
3341 Return the corresponding character code in Big5.
3342 */
3343        (ch))
3344 {
3345   Lisp_Object charset;
3346   int c1, c2, b1, b2;
3347
3348   CHECK_CHAR_COERCE_INT (ch);
3349   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3350   if (EQ (charset, Vcharset_chinese_big5_1) ||
3351       EQ (charset, Vcharset_chinese_big5_2))
3352     {
3353       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3354                    b1, b2);
3355       return Fcons (make_int (b1), make_int (b2));
3356     }
3357   else
3358     return Qnil;
3359 }
3360
3361 \f
3362 /************************************************************************/
3363 /*                           UCS-4 methods                              */
3364 /************************************************************************/
3365
3366 static int
3367 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3368                     unsigned int n)
3369 {
3370   while (n--)
3371     {
3372       int c = *src++;
3373       switch (st->ucs4.in_byte)
3374         {
3375         case 0:
3376           if (c >= 128)
3377             return 0;
3378           else
3379             st->ucs4.in_byte++;
3380           break;
3381         case 3:
3382           st->ucs4.in_byte = 0;
3383           break;
3384         default:
3385           st->ucs4.in_byte++;
3386         }
3387     }
3388   return CODING_CATEGORY_UCS4_MASK;
3389 }
3390
3391 static void
3392 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3393                     unsigned_char_dynarr *dst, unsigned int n)
3394 {
3395   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3396   unsigned int flags = str->flags;
3397   unsigned int ch    = str->ch;
3398   unsigned char counter = str->counter;
3399
3400   while (n--)
3401     {
3402       unsigned char c = *src++;
3403       switch (counter)
3404         {
3405         case 0:
3406           ch = c;
3407           counter = 3;
3408           break;
3409         case 1:
3410           DECODE_ADD_UCS_CHAR ((ch << 8) | c, dst);
3411           ch = 0;
3412           counter = 0;
3413           break;
3414         default:
3415           ch = ( ch << 8 ) | c;
3416           counter--;
3417         }
3418     }
3419   if (counter & CODING_STATE_END)
3420     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3421
3422   str->flags = flags;
3423   str->ch    = ch;
3424   str->counter = counter;
3425 }
3426
3427 void
3428 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3429                   unsigned_char_dynarr *dst, unsigned int *flags)
3430 {
3431   Dynarr_add (dst, ch >> 24);
3432   Dynarr_add (dst, ch >> 16);
3433   Dynarr_add (dst, ch >>  8);
3434   Dynarr_add (dst, ch      );
3435 }
3436
3437 void
3438 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3439                   unsigned int *flags)
3440 {
3441 }
3442
3443 \f
3444 /************************************************************************/
3445 /*                           UTF-8 methods                              */
3446 /************************************************************************/
3447
3448 static int
3449 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3450                     unsigned int n)
3451 {
3452   while (n--)
3453     {
3454       unsigned char c = *src++;
3455       switch (st->utf8.in_byte)
3456         {
3457         case 0:
3458           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3459             return 0;
3460           else if (c >= 0xfc)
3461             st->utf8.in_byte = 5;
3462           else if (c >= 0xf8)
3463             st->utf8.in_byte = 4;
3464           else if (c >= 0xf0)
3465             st->utf8.in_byte = 3;
3466           else if (c >= 0xe0)
3467             st->utf8.in_byte = 2;
3468           else if (c >= 0xc0)
3469             st->utf8.in_byte = 1;
3470           else if (c >= 0x80)
3471             return 0;
3472           break;
3473         default:
3474           if ((c & 0xc0) != 0x80)
3475             return 0;
3476           else
3477             st->utf8.in_byte--;
3478         }
3479     }
3480   return CODING_CATEGORY_UTF8_MASK;
3481 }
3482
3483 static void
3484 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3485                     unsigned_char_dynarr *dst, unsigned int n)
3486 {
3487   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3488   unsigned int flags  = str->flags;
3489   unsigned int ch     = str->ch;
3490   eol_type_t eol_type = str->eol_type;
3491   unsigned char counter = str->counter;
3492
3493   while (n--)
3494     {
3495       unsigned char c = *src++;
3496       switch (counter)
3497         {
3498         case 0:
3499           if ( c >= 0xfc )
3500             {
3501               ch = c & 0x01;
3502               counter = 5;
3503             }
3504           else if ( c >= 0xf8 )
3505             {
3506               ch = c & 0x03;
3507               counter = 4;
3508             }
3509           else if ( c >= 0xf0 )
3510             {
3511               ch = c & 0x07;
3512               counter = 3;
3513             }
3514           else if ( c >= 0xe0 )
3515             {
3516               ch = c & 0x0f;
3517               counter = 2;
3518             }
3519           else if ( c >= 0xc0 )
3520             {
3521               ch = c & 0x1f;
3522               counter = 1;
3523             }
3524           else
3525             {
3526               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3527               DECODE_ADD_UCS_CHAR (c, dst);
3528             }
3529           break;
3530         case 1:
3531           ch = ( ch << 6 ) | ( c & 0x3f );
3532           DECODE_ADD_UCS_CHAR (ch, dst);
3533           ch = 0;
3534           counter = 0;
3535           break;
3536         default:
3537           ch = ( ch << 6 ) | ( c & 0x3f );
3538           counter--;
3539         }
3540     label_continue_loop:;
3541     }
3542
3543   if (flags & CODING_STATE_END)
3544     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3545
3546   str->flags = flags;
3547   str->ch    = ch;
3548   str->counter = counter;
3549 }
3550
3551 void
3552 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
3553                   unsigned_char_dynarr *dst, unsigned int *flags)
3554 {
3555   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3556
3557   if (ch == '\n')
3558     {
3559       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3560         Dynarr_add (dst, '\r');
3561       if (eol_type != EOL_CR)
3562         Dynarr_add (dst, ch);
3563     }
3564   else if (ch <= 0x7f)
3565     {
3566       Dynarr_add (dst, ch);
3567     }
3568   else if (ch <= 0x7ff)
3569     {
3570       Dynarr_add (dst, (ch >> 6) | 0xc0);
3571       Dynarr_add (dst, (ch & 0x3f) | 0x80);
3572     }
3573   else if (ch <= 0xffff)
3574     {
3575       Dynarr_add (dst,  (ch >> 12) | 0xe0);
3576       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3577       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3578     }
3579   else if (ch <= 0x1fffff)
3580     {
3581       Dynarr_add (dst,  (ch >> 18) | 0xf0);
3582       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3583       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3584       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3585     }
3586   else if (ch <= 0x3ffffff)
3587     {
3588       Dynarr_add (dst,  (ch >> 24) | 0xf8);
3589       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3590       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3591       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3592       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3593     }
3594   else
3595     {
3596       Dynarr_add (dst,  (ch >> 30) | 0xfc);
3597       Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
3598       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3599       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3600       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3601       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3602     }
3603 }
3604
3605 void
3606 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3607                   unsigned int *flags)
3608 {
3609 }
3610
3611 \f
3612 /************************************************************************/
3613 /*                           ISO2022 methods                            */
3614 /************************************************************************/
3615
3616 /* The following note describes the coding system ISO2022 briefly.
3617    Since the intention of this note is to help understand the
3618    functions in this file, some parts are NOT ACCURATE or OVERLY
3619    SIMPLIFIED.  For thorough understanding, please refer to the
3620    original document of ISO2022.
3621
3622    ISO2022 provides many mechanisms to encode several character sets
3623    in 7-bit and 8-bit environments.  For 7-bit environments, all text
3624    is encoded using bytes less than 128.  This may make the encoded
3625    text a little bit longer, but the text passes more easily through
3626    several gateways, some of which strip off MSB (Most Signigant Bit).
3627
3628    There are two kinds of character sets: control character set and
3629    graphic character set.  The former contains control characters such
3630    as `newline' and `escape' to provide control functions (control
3631    functions are also provided by escape sequences).  The latter
3632    contains graphic characters such as 'A' and '-'.  Emacs recognizes
3633    two control character sets and many graphic character sets.
3634
3635    Graphic character sets are classified into one of the following
3636    four classes, according to the number of bytes (DIMENSION) and
3637    number of characters in one dimension (CHARS) of the set:
3638    - DIMENSION1_CHARS94
3639    - DIMENSION1_CHARS96
3640    - DIMENSION2_CHARS94
3641    - DIMENSION2_CHARS96
3642
3643    In addition, each character set is assigned an identification tag,
3644    unique for each set, called "final character" (denoted as <F>
3645    hereafter).  The <F> of each character set is decided by ECMA(*)
3646    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
3647    (0x30..0x3F are for private use only).
3648
3649    Note (*): ECMA = European Computer Manufacturers Association
3650
3651    Here are examples of graphic character set [NAME(<F>)]:
3652         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3653         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3654         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3655         o DIMENSION2_CHARS96 -- none for the moment
3656
3657    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3658         C0 [0x00..0x1F] -- control character plane 0
3659         GL [0x20..0x7F] -- graphic character plane 0
3660         C1 [0x80..0x9F] -- control character plane 1
3661         GR [0xA0..0xFF] -- graphic character plane 1
3662
3663    A control character set is directly designated and invoked to C0 or
3664    C1 by an escape sequence.  The most common case is that:
3665    - ISO646's  control character set is designated/invoked to C0, and
3666    - ISO6429's control character set is designated/invoked to C1,
3667    and usually these designations/invocations are omitted in encoded
3668    text.  In a 7-bit environment, only C0 can be used, and a control
3669    character for C1 is encoded by an appropriate escape sequence to
3670    fit into the environment.  All control characters for C1 are
3671    defined to have corresponding escape sequences.
3672
3673    A graphic character set is at first designated to one of four
3674    graphic registers (G0 through G3), then these graphic registers are
3675    invoked to GL or GR.  These designations and invocations can be
3676    done independently.  The most common case is that G0 is invoked to
3677    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
3678    these invocations and designations are omitted in encoded text.
3679    In a 7-bit environment, only GL can be used.
3680
3681    When a graphic character set of CHARS94 is invoked to GL, codes
3682    0x20 and 0x7F of the GL area work as control characters SPACE and
3683    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3684    be used.
3685
3686    There are two ways of invocation: locking-shift and single-shift.
3687    With locking-shift, the invocation lasts until the next different
3688    invocation, whereas with single-shift, the invocation affects the
3689    following character only and doesn't affect the locking-shift
3690    state.  Invocations are done by the following control characters or
3691    escape sequences:
3692
3693    ----------------------------------------------------------------------
3694    abbrev  function                  cntrl escape seq   description
3695    ----------------------------------------------------------------------
3696    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
3697    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
3698    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
3699    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
3700    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
3701    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
3702    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
3703    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
3704    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
3705    ----------------------------------------------------------------------
3706    (*) These are not used by any known coding system.
3707
3708    Control characters for these functions are defined by macros
3709    ISO_CODE_XXX in `coding.h'.
3710
3711    Designations are done by the following escape sequences:
3712    ----------------------------------------------------------------------
3713    escape sequence      description
3714    ----------------------------------------------------------------------
3715    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
3716    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
3717    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
3718    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
3719    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
3720    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
3721    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
3722    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
3723    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
3724    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
3725    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
3726    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
3727    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
3728    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
3729    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
3730    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
3731    ----------------------------------------------------------------------
3732
3733    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3734    of dimension 1, chars 94, and final character <F>, etc...
3735
3736    Note (*): Although these designations are not allowed in ISO2022,
3737    Emacs accepts them on decoding, and produces them on encoding
3738    CHARS96 character sets in a coding system which is characterized as
3739    7-bit environment, non-locking-shift, and non-single-shift.
3740
3741    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3742    '(' can be omitted.  We refer to this as "short-form" hereafter.
3743
3744    Now you may notice that there are a lot of ways for encoding the
3745    same multilingual text in ISO2022.  Actually, there exist many
3746    coding systems such as Compound Text (used in X11's inter client
3747    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3748    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3749    localized platforms), and all of these are variants of ISO2022.
3750
3751    In addition to the above, Emacs handles two more kinds of escape
3752    sequences: ISO6429's direction specification and Emacs' private
3753    sequence for specifying character composition.
3754
3755    ISO6429's direction specification takes the following form:
3756         o CSI ']'      -- end of the current direction
3757         o CSI '0' ']'  -- end of the current direction
3758         o CSI '1' ']'  -- start of left-to-right text
3759         o CSI '2' ']'  -- start of right-to-left text
3760    The control character CSI (0x9B: control sequence introducer) is
3761    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3762
3763    Character composition specification takes the following form:
3764         o ESC '0' -- start character composition
3765         o ESC '1' -- end character composition
3766    Since these are not standard escape sequences of any ISO standard,
3767    their use with these meanings is restricted to Emacs only.  */
3768
3769 static void
3770 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3771 {
3772   int i;
3773
3774   for (i = 0; i < 4; i++)
3775     {
3776       if (!NILP (coding_system))
3777         iso->charset[i] =
3778           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3779       else
3780         iso->charset[i] = Qt;
3781       iso->invalid_designated[i] = 0;
3782     }
3783   iso->esc = ISO_ESC_NOTHING;
3784   iso->esc_bytes_index = 0;
3785   iso->register_left = 0;
3786   iso->register_right = 1;
3787   iso->switched_dir_and_no_valid_charset_yet = 0;
3788   iso->invalid_switch_dir = 0;
3789   iso->output_direction_sequence = 0;
3790   iso->output_literally = 0;
3791 #ifdef ENABLE_COMPOSITE_CHARS
3792   if (iso->composite_chars)
3793     Dynarr_reset (iso->composite_chars);
3794 #endif
3795 }
3796
3797 static int
3798 fit_to_be_escape_quoted (unsigned char c)
3799 {
3800   switch (c)
3801     {
3802     case ISO_CODE_ESC:
3803     case ISO_CODE_CSI:
3804     case ISO_CODE_SS2:
3805     case ISO_CODE_SS3:
3806     case ISO_CODE_SO:
3807     case ISO_CODE_SI:
3808       return 1;
3809
3810     default:
3811       return 0;
3812     }
3813 }
3814
3815 /* Parse one byte of an ISO2022 escape sequence.
3816    If the result is an invalid escape sequence, return 0 and
3817    do not change anything in STR.  Otherwise, if the result is
3818    an incomplete escape sequence, update ISO2022.ESC and
3819    ISO2022.ESC_BYTES and return -1.  Otherwise, update
3820    all the state variables (but not ISO2022.ESC_BYTES) and
3821    return 1.
3822
3823    If CHECK_INVALID_CHARSETS is non-zero, check for designation
3824    or invocation of an invalid character set and treat that as
3825    an unrecognized escape sequence. */
3826
3827 static int
3828 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3829                    unsigned char c, unsigned int *flags,
3830                    int check_invalid_charsets)
3831 {
3832   /* (1) If we're at the end of a designation sequence, CS is the
3833      charset being designated and REG is the register to designate
3834      it to.
3835
3836      (2) If we're at the end of a locking-shift sequence, REG is
3837      the register to invoke and HALF (0 == left, 1 == right) is
3838      the half to invoke it into.
3839
3840      (3) If we're at the end of a single-shift sequence, REG is
3841      the register to invoke. */
3842   Lisp_Object cs = Qnil;
3843   int reg, half;
3844
3845   /* NOTE: This code does goto's all over the fucking place.
3846      The reason for this is that we're basically implementing
3847      a state machine here, and hierarchical languages like C
3848      don't really provide a clean way of doing this. */
3849
3850   if (! (*flags & CODING_STATE_ESCAPE))
3851     /* At beginning of escape sequence; we need to reset our
3852        escape-state variables. */
3853     iso->esc = ISO_ESC_NOTHING;
3854
3855   iso->output_literally = 0;
3856   iso->output_direction_sequence = 0;
3857
3858   switch (iso->esc)
3859     {
3860     case ISO_ESC_NOTHING:
3861       iso->esc_bytes_index = 0;
3862       switch (c)
3863         {
3864         case ISO_CODE_ESC:      /* Start escape sequence */
3865           *flags |= CODING_STATE_ESCAPE;
3866           iso->esc = ISO_ESC;
3867           goto not_done;
3868
3869         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
3870           *flags |= CODING_STATE_ESCAPE;
3871           iso->esc = ISO_ESC_5_11;
3872           goto not_done;
3873
3874         case ISO_CODE_SO:       /* locking shift 1 */
3875           reg = 1; half = 0;
3876           goto locking_shift;
3877         case ISO_CODE_SI:       /* locking shift 0 */
3878           reg = 0; half = 0;
3879           goto locking_shift;
3880
3881         case ISO_CODE_SS2:      /* single shift */
3882           reg = 2;
3883           goto single_shift;
3884         case ISO_CODE_SS3:      /* single shift */
3885           reg = 3;
3886           goto single_shift;
3887
3888         default:                        /* Other control characters */
3889           return 0;
3890         }
3891
3892     case ISO_ESC:
3893       switch (c)
3894         {
3895           /**** single shift ****/
3896
3897         case 'N':       /* single shift 2 */
3898           reg = 2;
3899           goto single_shift;
3900         case 'O':       /* single shift 3 */
3901           reg = 3;
3902           goto single_shift;
3903
3904           /**** locking shift ****/
3905
3906         case '~':       /* locking shift 1 right */
3907           reg = 1; half = 1;
3908           goto locking_shift;
3909         case 'n':       /* locking shift 2 */
3910           reg = 2; half = 0;
3911           goto locking_shift;
3912         case '}':       /* locking shift 2 right */
3913           reg = 2; half = 1;
3914           goto locking_shift;
3915         case 'o':       /* locking shift 3 */
3916           reg = 3; half = 0;
3917           goto locking_shift;
3918         case '|':       /* locking shift 3 right */
3919           reg = 3; half = 1;
3920           goto locking_shift;
3921
3922 #ifdef ENABLE_COMPOSITE_CHARS
3923           /**** composite ****/
3924
3925         case '0':
3926           iso->esc = ISO_ESC_START_COMPOSITE;
3927           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
3928             CODING_STATE_COMPOSITE;
3929           return 1;
3930
3931         case '1':
3932           iso->esc = ISO_ESC_END_COMPOSITE;
3933           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
3934             ~CODING_STATE_COMPOSITE;
3935           return 1;
3936 #endif /* ENABLE_COMPOSITE_CHARS */
3937
3938           /**** directionality ****/
3939
3940         case '[':
3941           iso->esc = ISO_ESC_5_11;
3942           goto not_done;
3943
3944           /**** designation ****/
3945
3946         case '$':       /* multibyte charset prefix */
3947           iso->esc = ISO_ESC_2_4;
3948           goto not_done;
3949
3950         default:
3951           if (0x28 <= c && c <= 0x2F)
3952             {
3953               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
3954               goto not_done;
3955             }
3956
3957           /* This function is called with CODESYS equal to nil when
3958              doing coding-system detection. */
3959           if (!NILP (codesys)
3960               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
3961               && fit_to_be_escape_quoted (c))
3962             {
3963               iso->esc = ISO_ESC_LITERAL;
3964               *flags &= CODING_STATE_ISO2022_LOCK;
3965               return 1;
3966             }
3967
3968           /* bzzzt! */
3969           return 0;
3970         }
3971
3972
3973
3974       /**** directionality ****/
3975
3976     case ISO_ESC_5_11:          /* ISO6429 direction control */
3977       if (c == ']')
3978         {
3979           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3980           goto directionality;
3981         }
3982       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
3983       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
3984       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
3985       else               return 0;
3986       goto not_done;
3987
3988     case ISO_ESC_5_11_0:
3989       if (c == ']')
3990         {
3991           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3992           goto directionality;
3993         }
3994       return 0;
3995
3996     case ISO_ESC_5_11_1:
3997       if (c == ']')
3998         {
3999           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4000           goto directionality;
4001         }
4002       return 0;
4003
4004     case ISO_ESC_5_11_2:
4005       if (c == ']')
4006         {
4007           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4008           goto directionality;
4009         }
4010       return 0;
4011
4012     directionality:
4013       iso->esc = ISO_ESC_DIRECTIONALITY;
4014       /* Various junk here to attempt to preserve the direction sequences
4015          literally in the text if they would otherwise be swallowed due
4016          to invalid designations that don't show up as actual charset
4017          changes in the text. */
4018       if (iso->invalid_switch_dir)
4019         {
4020           /* We already inserted a direction switch literally into the
4021              text.  We assume (#### this may not be right) that the
4022              next direction switch is the one going the other way,
4023              and we need to output that literally as well. */
4024           iso->output_literally = 1;
4025           iso->invalid_switch_dir = 0;
4026         }
4027       else
4028         {
4029           int jj;
4030
4031           /* If we are in the thrall of an invalid designation,
4032            then stick the directionality sequence literally into the
4033            output stream so it ends up in the original text again. */
4034           for (jj = 0; jj < 4; jj++)
4035             if (iso->invalid_designated[jj])
4036               break;
4037           if (jj < 4)
4038             {
4039               iso->output_literally = 1;
4040               iso->invalid_switch_dir = 1;
4041             }
4042           else
4043             /* Indicate that we haven't yet seen a valid designation,
4044                so that if a switch-dir is directly followed by an
4045                invalid designation, both get inserted literally. */
4046             iso->switched_dir_and_no_valid_charset_yet = 1;
4047         }
4048       return 1;
4049
4050
4051       /**** designation ****/
4052
4053     case ISO_ESC_2_4:
4054       if (0x28 <= c && c <= 0x2F)
4055         {
4056           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4057           goto not_done;
4058         }
4059       if (0x40 <= c && c <= 0x42)
4060         {
4061           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4062                                       *flags & CODING_STATE_R2L ?
4063                                       CHARSET_RIGHT_TO_LEFT :
4064                                       CHARSET_LEFT_TO_RIGHT);
4065           reg = 0;
4066           goto designated;
4067         }
4068       return 0;
4069
4070     default:
4071       {
4072         int type =-1;
4073
4074         if (c < '0' || c > '~')
4075           return 0; /* bad final byte */
4076
4077         if (iso->esc >= ISO_ESC_2_8 &&
4078             iso->esc <= ISO_ESC_2_15)
4079           {
4080             type = ((iso->esc >= ISO_ESC_2_12) ?
4081                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4082             reg = (iso->esc - ISO_ESC_2_8) & 3;
4083           }
4084         else if (iso->esc >= ISO_ESC_2_4_8 &&
4085                  iso->esc <= ISO_ESC_2_4_15)
4086           {
4087             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4088                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4089             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4090           }
4091         else
4092           {
4093             /* Can this ever be reached? -slb */
4094             abort();
4095           }
4096
4097         cs = CHARSET_BY_ATTRIBUTES (type, c,
4098                                     *flags & CODING_STATE_R2L ?
4099                                     CHARSET_RIGHT_TO_LEFT :
4100                                     CHARSET_LEFT_TO_RIGHT);
4101         goto designated;
4102       }
4103     }
4104
4105  not_done:
4106   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4107   return -1;
4108
4109  single_shift:
4110   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4111     /* can't invoke something that ain't there. */
4112     return 0;
4113   iso->esc = ISO_ESC_SINGLE_SHIFT;
4114   *flags &= CODING_STATE_ISO2022_LOCK;
4115   if (reg == 2)
4116     *flags |= CODING_STATE_SS2;
4117   else
4118     *flags |= CODING_STATE_SS3;
4119   return 1;
4120
4121  locking_shift:
4122   if (check_invalid_charsets &&
4123       !CHARSETP (iso->charset[reg]))
4124     /* can't invoke something that ain't there. */
4125     return 0;
4126   if (half)
4127     iso->register_right = reg;
4128   else
4129     iso->register_left = reg;
4130   *flags &= CODING_STATE_ISO2022_LOCK;
4131   iso->esc = ISO_ESC_LOCKING_SHIFT;
4132   return 1;
4133
4134  designated:
4135   if (NILP (cs) && check_invalid_charsets)
4136     {
4137       iso->invalid_designated[reg] = 1;
4138       iso->charset[reg] = Vcharset_ascii;
4139       iso->esc = ISO_ESC_DESIGNATE;
4140       *flags &= CODING_STATE_ISO2022_LOCK;
4141       iso->output_literally = 1;
4142       if (iso->switched_dir_and_no_valid_charset_yet)
4143         {
4144           /* We encountered a switch-direction followed by an
4145              invalid designation.  Ensure that the switch-direction
4146              gets outputted; otherwise it will probably get eaten
4147              when the text is written out again. */
4148           iso->switched_dir_and_no_valid_charset_yet = 0;
4149           iso->output_direction_sequence = 1;
4150           /* And make sure that the switch-dir going the other
4151              way gets outputted, as well. */
4152           iso->invalid_switch_dir = 1;
4153         }
4154       return 1;
4155     }
4156   /* This function is called with CODESYS equal to nil when
4157      doing coding-system detection. */
4158   if (!NILP (codesys))
4159     {
4160       charset_conversion_spec_dynarr *dyn =
4161         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4162
4163       if (dyn)
4164         {
4165           int i;
4166
4167           for (i = 0; i < Dynarr_length (dyn); i++)
4168             {
4169               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4170               if (EQ (cs, spec->from_charset))
4171                 cs = spec->to_charset;
4172             }
4173         }
4174     }
4175
4176   iso->charset[reg] = cs;
4177   iso->esc = ISO_ESC_DESIGNATE;
4178   *flags &= CODING_STATE_ISO2022_LOCK;
4179   if (iso->invalid_designated[reg])
4180     {
4181       iso->invalid_designated[reg] = 0;
4182       iso->output_literally = 1;
4183     }
4184   if (iso->switched_dir_and_no_valid_charset_yet)
4185     iso->switched_dir_and_no_valid_charset_yet = 0;
4186   return 1;
4187 }
4188
4189 static int
4190 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4191                        unsigned int n)
4192 {
4193   int mask;
4194
4195   /* #### There are serious deficiencies in the recognition mechanism
4196      here.  This needs to be much smarter if it's going to cut it.
4197      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4198      it should be detected as Latin-1.
4199      All the ISO2022 stuff in this file should be synced up with the
4200      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4201      Perhaps we should wait till R2L works in FSF Emacs? */
4202
4203   if (!st->iso2022.initted)
4204     {
4205       reset_iso2022 (Qnil, &st->iso2022.iso);
4206       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4207                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4208                           CODING_CATEGORY_ISO_8_1_MASK |
4209                           CODING_CATEGORY_ISO_8_2_MASK |
4210                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4211       st->iso2022.flags = 0;
4212       st->iso2022.high_byte_count = 0;
4213       st->iso2022.saw_single_shift = 0;
4214       st->iso2022.initted = 1;
4215     }
4216
4217   mask = st->iso2022.mask;
4218
4219   while (n--)
4220     {
4221       int c = *src++;
4222       if (c >= 0xA0)
4223         {
4224           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4225           st->iso2022.high_byte_count++;
4226         }
4227       else
4228         {
4229           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4230             {
4231               if (st->iso2022.high_byte_count & 1)
4232                 /* odd number of high bytes; assume not iso-8-2 */
4233                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4234             }
4235           st->iso2022.high_byte_count = 0;
4236           st->iso2022.saw_single_shift = 0;
4237           if (c > 0x80)
4238             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4239         }
4240       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4241           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4242         { /* control chars */
4243           switch (c)
4244             {
4245               /* Allow and ignore control characters that you might
4246                  reasonably see in a text file */
4247             case '\r':
4248             case '\n':
4249             case '\t':
4250             case  7: /* bell */
4251             case  8: /* backspace */
4252             case 11: /* vertical tab */
4253             case 12: /* form feed */
4254             case 26: /* MS-DOS C-z junk */
4255             case 31: /* '^_' -- for info */
4256               goto label_continue_loop;
4257
4258             default:
4259               break;
4260             }
4261         }
4262
4263       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4264           || BYTE_C1_P (c))
4265         {
4266           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4267                                  &st->iso2022.flags, 0))
4268             {
4269               switch (st->iso2022.iso.esc)
4270                 {
4271                 case ISO_ESC_DESIGNATE:
4272                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4273                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4274                   break;
4275                 case ISO_ESC_LOCKING_SHIFT:
4276                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4277                   goto ran_out_of_chars;
4278                 case ISO_ESC_SINGLE_SHIFT:
4279                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4280                   st->iso2022.saw_single_shift = 1;
4281                   break;
4282                 default:
4283                   break;
4284                 }
4285             }
4286           else
4287             {
4288               mask = 0;
4289               goto ran_out_of_chars;
4290             }
4291         }
4292     label_continue_loop:;
4293     }
4294
4295  ran_out_of_chars:
4296
4297   return mask;
4298 }
4299
4300 static int
4301 postprocess_iso2022_mask (int mask)
4302 {
4303   /* #### kind of cheesy */
4304   /* If seven-bit ISO is allowed, then assume that the encoding is
4305      entirely seven-bit and turn off the eight-bit ones. */
4306   if (mask & CODING_CATEGORY_ISO_7_MASK)
4307     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4308                CODING_CATEGORY_ISO_8_1_MASK |
4309                CODING_CATEGORY_ISO_8_2_MASK);
4310   return mask;
4311 }
4312
4313 /* If FLAGS is a null pointer or specifies right-to-left motion,
4314    output a switch-dir-to-left-to-right sequence to DST.
4315    Also update FLAGS if it is not a null pointer.
4316    If INTERNAL_P is set, we are outputting in internal format and
4317    need to handle the CSI differently. */
4318
4319 static void
4320 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4321                                  unsigned_char_dynarr *dst,
4322                                  unsigned int *flags,
4323                                  int internal_p)
4324 {
4325   if (!flags || (*flags & CODING_STATE_R2L))
4326     {
4327       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4328         {
4329           Dynarr_add (dst, ISO_CODE_ESC);
4330           Dynarr_add (dst, '[');
4331         }
4332       else if (internal_p)
4333         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4334       else
4335         Dynarr_add (dst, ISO_CODE_CSI);
4336       Dynarr_add (dst, '0');
4337       Dynarr_add (dst, ']');
4338       if (flags)
4339         *flags &= ~CODING_STATE_R2L;
4340     }
4341 }
4342
4343 /* If FLAGS is a null pointer or specifies a direction different from
4344    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4345    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4346    sequence to DST.  Also update FLAGS if it is not a null pointer.
4347    If INTERNAL_P is set, we are outputting in internal format and
4348    need to handle the CSI differently. */
4349
4350 static void
4351 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4352                           unsigned_char_dynarr *dst, unsigned int *flags,
4353                           int internal_p)
4354 {
4355   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4356       direction == CHARSET_LEFT_TO_RIGHT)
4357     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4358   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4359            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4360            direction == CHARSET_RIGHT_TO_LEFT)
4361     {
4362       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4363         {
4364           Dynarr_add (dst, ISO_CODE_ESC);
4365           Dynarr_add (dst, '[');
4366         }
4367       else if (internal_p)
4368         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4369       else
4370         Dynarr_add (dst, ISO_CODE_CSI);
4371       Dynarr_add (dst, '2');
4372       Dynarr_add (dst, ']');
4373       if (flags)
4374         *flags |= CODING_STATE_R2L;
4375     }
4376 }
4377
4378 /* Convert ISO2022-format data to internal format. */
4379
4380 static void
4381 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4382                        unsigned_char_dynarr *dst, unsigned int n)
4383 {
4384   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4385   unsigned int flags  = str->flags;
4386   unsigned int ch     = str->ch;
4387   eol_type_t eol_type = str->eol_type;
4388 #ifdef ENABLE_COMPOSITE_CHARS
4389   unsigned_char_dynarr *real_dst = dst;
4390 #endif
4391   Lisp_Object coding_system;
4392
4393   XSETCODING_SYSTEM (coding_system, str->codesys);
4394
4395 #ifdef ENABLE_COMPOSITE_CHARS
4396   if (flags & CODING_STATE_COMPOSITE)
4397     dst = str->iso2022.composite_chars;
4398 #endif /* ENABLE_COMPOSITE_CHARS */
4399
4400   while (n--)
4401     {
4402       unsigned char c = *src++;
4403       if (flags & CODING_STATE_ESCAPE)
4404         {       /* Within ESC sequence */
4405           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4406                                           c, &flags, 1);
4407
4408           if (retval)
4409             {
4410               switch (str->iso2022.esc)
4411                 {
4412 #ifdef ENABLE_COMPOSITE_CHARS
4413                 case ISO_ESC_START_COMPOSITE:
4414                   if (str->iso2022.composite_chars)
4415                     Dynarr_reset (str->iso2022.composite_chars);
4416                   else
4417                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4418                   dst = str->iso2022.composite_chars;
4419                   break;
4420                 case ISO_ESC_END_COMPOSITE:
4421                   {
4422                     Bufbyte comstr[MAX_EMCHAR_LEN];
4423                     Bytecount len;
4424                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4425                                                          Dynarr_length (dst));
4426                     dst = real_dst;
4427                     len = set_charptr_emchar (comstr, emch);
4428                     Dynarr_add_many (dst, comstr, len);
4429                     break;
4430                   }
4431 #endif /* ENABLE_COMPOSITE_CHARS */
4432
4433                 case ISO_ESC_LITERAL:
4434                   DECODE_ADD_BINARY_CHAR (c, dst);
4435                   break;
4436
4437                 default:
4438                   /* Everything else handled already */
4439                   break;
4440                 }
4441             }
4442
4443           /* Attempted error recovery. */
4444           if (str->iso2022.output_direction_sequence)
4445             ensure_correct_direction (flags & CODING_STATE_R2L ?
4446                                       CHARSET_RIGHT_TO_LEFT :
4447                                       CHARSET_LEFT_TO_RIGHT,
4448                                       str->codesys, dst, 0, 1);
4449           /* More error recovery. */
4450           if (!retval || str->iso2022.output_literally)
4451             {
4452               /* Output the (possibly invalid) sequence */
4453               int i;
4454               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4455                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4456               flags &= CODING_STATE_ISO2022_LOCK;
4457               if (!retval)
4458                 n++, src--;/* Repeat the loop with the same character. */
4459               else
4460                 {
4461                   /* No sense in reprocessing the final byte of the
4462                      escape sequence; it could mess things up anyway.
4463                      Just add it now. */
4464                   DECODE_ADD_BINARY_CHAR (c, dst);
4465                 }
4466             }
4467           ch = 0;
4468         }
4469       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4470         { /* Control characters */
4471
4472           /***** Error-handling *****/
4473
4474           /* If we were in the middle of a character, dump out the
4475              partial character. */
4476           DECODE_OUTPUT_PARTIAL_CHAR (ch);
4477
4478           /* If we just saw a single-shift character, dump it out.
4479              This may dump out the wrong sort of single-shift character,
4480              but least it will give an indication that something went
4481              wrong. */
4482           if (flags & CODING_STATE_SS2)
4483             {
4484               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4485               flags &= ~CODING_STATE_SS2;
4486             }
4487           if (flags & CODING_STATE_SS3)
4488             {
4489               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4490               flags &= ~CODING_STATE_SS3;
4491             }
4492
4493           /***** Now handle the control characters. *****/
4494
4495           /* Handle CR/LF */
4496           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4497
4498           flags &= CODING_STATE_ISO2022_LOCK;
4499
4500           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4501             DECODE_ADD_BINARY_CHAR (c, dst);
4502         }
4503       else
4504         {                       /* Graphic characters */
4505           Lisp_Object charset;
4506 #ifndef UTF2000
4507           Charset_ID lb;
4508 #endif
4509           int reg;
4510
4511           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4512
4513           /* Now determine the charset. */
4514           reg = ((flags & CODING_STATE_SS2) ? 2
4515                  : (flags & CODING_STATE_SS3) ? 3
4516                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4517                  : str->iso2022.register_left);
4518           charset = str->iso2022.charset[reg];
4519
4520           /* Error checking: */
4521           if (! CHARSETP (charset)
4522               || str->iso2022.invalid_designated[reg]
4523               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4524                   && XCHARSET_CHARS (charset) == 94))
4525             /* Mrmph.  We are trying to invoke a register that has no
4526                or an invalid charset in it, or trying to add a character
4527                outside the range of the charset.  Insert that char literally
4528                to preserve it for the output. */
4529             {
4530               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4531               DECODE_ADD_BINARY_CHAR (c, dst);
4532             }
4533
4534           else
4535             {
4536               /* Things are probably hunky-dorey. */
4537
4538               /* Fetch reverse charset, maybe. */
4539               if (((flags & CODING_STATE_R2L) &&
4540                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4541                   ||
4542                   (!(flags & CODING_STATE_R2L) &&
4543                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4544                 {
4545                   Lisp_Object new_charset =
4546                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4547                   if (!NILP (new_charset))
4548                     charset = new_charset;
4549                 }
4550
4551 #ifdef UTF2000
4552               if (XCHARSET_DIMENSION (charset) == 1)
4553                 {
4554                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4555                   DECODE_ADD_UCS_CHAR
4556                     (MAKE_CHAR (charset, c & 0x7F, 0), dst);
4557                 }
4558               else if (ch)
4559                 {
4560                   DECODE_ADD_UCS_CHAR
4561                     (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
4562                   ch = 0;
4563                 }
4564               else
4565                 ch = c;
4566 #else
4567               lb = XCHARSET_LEADING_BYTE (charset);
4568               switch (XCHARSET_REP_BYTES (charset))
4569                 {
4570                 case 1: /* ASCII */
4571                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4572                   Dynarr_add (dst, c & 0x7F);
4573                   break;
4574
4575                 case 2: /* one-byte official */
4576                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4577                   Dynarr_add (dst, lb);
4578                   Dynarr_add (dst, c | 0x80);
4579                   break;
4580
4581                 case 3: /* one-byte private or two-byte official */
4582                   if (XCHARSET_PRIVATE_P (charset))
4583                     {
4584                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
4585                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4586                       Dynarr_add (dst, lb);
4587                       Dynarr_add (dst, c | 0x80);
4588                     }
4589                   else
4590                     {
4591                       if (ch)
4592                         {
4593                           Dynarr_add (dst, lb);
4594                           Dynarr_add (dst, ch | 0x80);
4595                           Dynarr_add (dst, c | 0x80);
4596                           ch = 0;
4597                         }
4598                       else
4599                         ch = c;
4600                     }
4601                   break;
4602
4603                 default:        /* two-byte private */
4604                   if (ch)
4605                     {
4606                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4607                       Dynarr_add (dst, lb);
4608                       Dynarr_add (dst, ch | 0x80);
4609                       Dynarr_add (dst, c | 0x80);
4610                       ch = 0;
4611                     }
4612                   else
4613                     ch = c;
4614                 }
4615 #endif
4616             }
4617
4618           if (!ch)
4619             flags &= CODING_STATE_ISO2022_LOCK;
4620         }
4621
4622     label_continue_loop:;
4623     }
4624
4625   if (flags & CODING_STATE_END)
4626     DECODE_OUTPUT_PARTIAL_CHAR (ch);
4627
4628   str->flags = flags;
4629   str->ch    = ch;
4630 }
4631
4632
4633 /***** ISO2022 encoder *****/
4634
4635 /* Designate CHARSET into register REG. */
4636
4637 static void
4638 iso2022_designate (Lisp_Object charset, unsigned char reg,
4639                    struct encoding_stream *str, unsigned_char_dynarr *dst)
4640 {
4641   static CONST char inter94[] = "()*+";
4642   static CONST char inter96[] = ",-./";
4643   unsigned int type;
4644   unsigned char final;
4645   Lisp_Object old_charset = str->iso2022.charset[reg];
4646
4647   str->iso2022.charset[reg] = charset;
4648   if (!CHARSETP (charset))
4649     /* charset might be an initial nil or t. */
4650     return;
4651   type = XCHARSET_TYPE (charset);
4652   final = XCHARSET_FINAL (charset);
4653   if (!str->iso2022.force_charset_on_output[reg] &&
4654       CHARSETP (old_charset) &&
4655       XCHARSET_TYPE (old_charset) == type &&
4656       XCHARSET_FINAL (old_charset) == final)
4657     return;
4658
4659   str->iso2022.force_charset_on_output[reg] = 0;
4660
4661   {
4662     charset_conversion_spec_dynarr *dyn =
4663       str->codesys->iso2022.output_conv;
4664
4665     if (dyn)
4666       {
4667         int i;
4668
4669         for (i = 0; i < Dynarr_length (dyn); i++)
4670           {
4671             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4672             if (EQ (charset, spec->from_charset))
4673                 charset = spec->to_charset;
4674           }
4675       }
4676   }
4677
4678   Dynarr_add (dst, ISO_CODE_ESC);
4679   switch (type)
4680     {
4681     case CHARSET_TYPE_94:
4682       Dynarr_add (dst, inter94[reg]);
4683       break;
4684     case CHARSET_TYPE_96:
4685       Dynarr_add (dst, inter96[reg]);
4686       break;
4687     case CHARSET_TYPE_94X94:
4688       Dynarr_add (dst, '$');
4689       if (reg != 0
4690           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4691           || final < '@'
4692           || final > 'B')
4693         Dynarr_add (dst, inter94[reg]);
4694       break;
4695     case CHARSET_TYPE_96X96:
4696       Dynarr_add (dst, '$');
4697       Dynarr_add (dst, inter96[reg]);
4698       break;
4699     }
4700   Dynarr_add (dst, final);
4701 }
4702
4703 static void
4704 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4705 {
4706   if (str->iso2022.register_left != 0)
4707     {
4708       Dynarr_add (dst, ISO_CODE_SI);
4709       str->iso2022.register_left = 0;
4710     }
4711 }
4712
4713 static void
4714 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4715 {
4716   if (str->iso2022.register_left != 1)
4717     {
4718       Dynarr_add (dst, ISO_CODE_SO);
4719       str->iso2022.register_left = 1;
4720     }
4721 }
4722
4723 void
4724 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
4725                      unsigned_char_dynarr *dst, unsigned int *flags)
4726 {
4727   unsigned char charmask;
4728   Lisp_Coding_System* codesys = str->codesys;
4729   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
4730   int i;
4731   Lisp_Object charset = str->iso2022.current_charset;
4732   int half = str->iso2022.current_half;
4733   unsigned int byte1, byte2;
4734
4735   if (ch <= 0x7F)
4736     {
4737       restore_left_to_right_direction (codesys, dst, flags, 0);
4738
4739       /* Make sure G0 contains ASCII */
4740       if ((ch > ' ' && ch < ISO_CODE_DEL)
4741           || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4742         {
4743           ensure_normal_shift (str, dst);
4744           iso2022_designate (Vcharset_ascii, 0, str, dst);
4745         }
4746
4747       /* If necessary, restore everything to the default state
4748          at end-of-line */
4749       if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4750         {
4751           restore_left_to_right_direction (codesys, dst, flags, 0);
4752
4753           ensure_normal_shift (str, dst);
4754
4755           for (i = 0; i < 4; i++)
4756             {
4757               Lisp_Object initial_charset =
4758                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4759               iso2022_designate (initial_charset, i, str, dst);
4760             }
4761         }
4762       if (ch == '\n')
4763         {
4764           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4765             Dynarr_add (dst, '\r');
4766           if (eol_type != EOL_CR)
4767             Dynarr_add (dst, ch);
4768         }
4769       else
4770         {
4771           if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4772               && fit_to_be_escape_quoted (ch))
4773             Dynarr_add (dst, ISO_CODE_ESC);
4774           Dynarr_add (dst, ch);
4775         }
4776     }
4777   else if ( (0x80 <= ch) && (ch <= 0x9f) )
4778     {
4779       charmask = (half == 0 ? 0x00 : 0x80);
4780
4781       if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4782           && fit_to_be_escape_quoted (ch))
4783         Dynarr_add (dst, ISO_CODE_ESC);
4784       /* you asked for it ... */
4785       Dynarr_add (dst, ch);
4786     }
4787   else
4788     {
4789       int reg;
4790
4791       /* Now determine which register to use. */
4792       reg = -1;
4793       for (i = 0; i < 4; i++)
4794         {
4795           Lisp_Object code_point;
4796
4797           if ((CHARSETP (charset = str->iso2022.charset[i])
4798                && !EQ (code_point = charset_code_point (charset, ch), Qnil))
4799               ||
4800               (CHARSETP
4801                (charset
4802                 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
4803                && !EQ (code_point = charset_code_point (charset, ch), Qnil)))
4804             {
4805               Lisp_Object ret = Fcar (code_point);
4806
4807               if (INTP (ret))
4808                 {
4809                   byte1 = XINT (ret);
4810                   ret = Fcar (Fcdr (code_point));
4811                   if (INTP (ret))
4812                     byte2 = XINT (ret);
4813                   else
4814                     byte2 = 0;
4815                 }
4816               else
4817                 {
4818                   byte1 = 0;
4819                   byte2 = 0;
4820                 }
4821               reg = i;
4822               break;
4823             }
4824         }
4825       if (reg == -1)
4826         {
4827           Lisp_Object original_default_coded_charset_priority_list
4828             = Vdefault_coded_charset_priority_list;
4829
4830           while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
4831             {
4832               BREAKUP_CHAR (ch, charset, byte1, byte2);
4833               if (XCHARSET_FINAL (charset))
4834                 goto found;
4835               Vdefault_coded_charset_priority_list
4836                 = Fcdr (Fmemq (XCHARSET_NAME (charset),
4837                                Vdefault_coded_charset_priority_list));
4838             }
4839           BREAKUP_CHAR (ch, charset, byte1, byte2);
4840         found:
4841           Vdefault_coded_charset_priority_list
4842             = original_default_coded_charset_priority_list;
4843         }
4844       ensure_correct_direction (XCHARSET_DIRECTION (charset),
4845                                 codesys, dst, flags, 0);
4846
4847       if (reg == -1)
4848         {
4849           if (XCHARSET_GRAPHIC (charset) != 0)
4850             {
4851               if (!NILP (str->iso2022.charset[1]) &&
4852                   (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
4853                    || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4854                 reg = 1;
4855               else if (!NILP (str->iso2022.charset[2]))
4856                 reg = 2;
4857               else if (!NILP (str->iso2022.charset[3]))
4858                 reg = 3;
4859               else
4860                 reg = 0;
4861             }
4862           else
4863             reg = 0;
4864         }
4865
4866       iso2022_designate (charset, reg, str, dst);
4867
4868       /* Now invoke that register. */
4869       switch (reg)
4870         {
4871         case 0:
4872           ensure_normal_shift (str, dst);
4873           half = 0;
4874           break;
4875         case 1:
4876           if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4877             {
4878               ensure_shift_out (str, dst);
4879               half = 0;
4880             }
4881           else
4882             half = 1;
4883           break;
4884         case 2:
4885           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4886             {
4887               Dynarr_add (dst, ISO_CODE_ESC);
4888               Dynarr_add (dst, 'N');
4889               half = 0;
4890             }
4891           else
4892             {
4893               Dynarr_add (dst, ISO_CODE_SS2);
4894               half = 1;
4895             }
4896           break;
4897         case 3:
4898           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4899             {
4900               Dynarr_add (dst, ISO_CODE_ESC);
4901               Dynarr_add (dst, 'O');
4902               half = 0;
4903             }
4904           else
4905             {
4906               Dynarr_add (dst, ISO_CODE_SS3);
4907               half = 1;
4908             }
4909           break;
4910         default:
4911           abort ();
4912         }
4913
4914       charmask = (half == 0 ? 0x00 : 0x80);
4915
4916       switch (XCHARSET_DIMENSION (charset))
4917         {
4918         case 1:
4919           Dynarr_add (dst, byte1 | charmask);
4920           break;
4921         case 2:
4922           Dynarr_add (dst, byte1 | charmask);
4923           Dynarr_add (dst, byte2 | charmask);
4924           break;
4925         default:
4926           abort ();
4927         }
4928     }
4929   str->iso2022.current_charset = charset;
4930   str->iso2022.current_half = half;
4931 }
4932
4933 void
4934 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4935                      unsigned int *flags)
4936 {
4937   Lisp_Coding_System* codesys = str->codesys;
4938   int i;
4939
4940   restore_left_to_right_direction (codesys, dst, flags, 0);
4941   ensure_normal_shift (str, dst);
4942   for (i = 0; i < 4; i++)
4943     {
4944       Lisp_Object initial_charset
4945         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4946       iso2022_designate (initial_charset, i, str, dst);
4947     }
4948 }
4949 #endif /* MULE */
4950 \f
4951 /************************************************************************/
4952 /*                     No-conversion methods                            */
4953 /************************************************************************/
4954
4955 /* This is used when reading in "binary" files -- i.e. files that may
4956    contain all 256 possible byte values and that are not to be
4957    interpreted as being in any particular decoding. */
4958 static void
4959 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
4960                              unsigned_char_dynarr *dst, unsigned int n)
4961 {
4962   unsigned char c;
4963   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4964   unsigned int flags  = str->flags;
4965   unsigned int ch     = str->ch;
4966   eol_type_t eol_type = str->eol_type;
4967
4968   while (n--)
4969     {
4970       c = *src++;
4971
4972       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4973       DECODE_ADD_BINARY_CHAR (c, dst);
4974     label_continue_loop:;
4975     }
4976
4977   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
4978
4979   str->flags = flags;
4980   str->ch    = ch;
4981 }
4982
4983 static void
4984 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
4985                              unsigned_char_dynarr *dst, unsigned int n)
4986 {
4987   unsigned char c;
4988   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4989   unsigned int flags  = str->flags;
4990   unsigned int ch     = str->ch;
4991   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4992 #ifdef UTF2000
4993   unsigned char char_boundary = str->iso2022.current_char_boundary;
4994 #endif
4995
4996   while (n--)
4997     {
4998       c = *src++;
4999 #ifdef UTF2000
5000       if (char_boundary == 0)
5001         if ( c >= 0xfc )
5002           {
5003             ch = c & 0x01;
5004             char_boundary = 5;
5005           }
5006         else if ( c >= 0xf8 )
5007           {
5008             ch = c & 0x03;
5009             char_boundary = 4;
5010           }
5011         else if ( c >= 0xf0 )
5012           {
5013             ch = c & 0x07;
5014             char_boundary = 3;
5015           }
5016         else if ( c >= 0xe0 )
5017           {
5018             ch = c & 0x0f;
5019             char_boundary = 2;
5020           }
5021         else if ( c >= 0xc0 )
5022           {
5023             ch = c & 0x1f;
5024             char_boundary = 1;
5025           }
5026         else
5027           {
5028             ch = 0;
5029             if (c == '\n')
5030               {
5031                 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5032                   Dynarr_add (dst, '\r');
5033                 if (eol_type != EOL_CR)
5034                   Dynarr_add (dst, c);
5035               }
5036             else
5037               Dynarr_add (dst, c);
5038             char_boundary = 0;
5039           }
5040       else if (char_boundary == 1)
5041         {
5042           ch = ( ch << 6 ) | ( c & 0x3f );
5043           Dynarr_add (dst, ch & 0xff);
5044           char_boundary = 0;
5045         }
5046       else
5047         {
5048           ch = ( ch << 6 ) | ( c & 0x3f );
5049           char_boundary--;
5050         }
5051 #else /* not UTF2000 */
5052       if (c == '\n')
5053         {
5054           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5055             Dynarr_add (dst, '\r');
5056           if (eol_type != EOL_CR)
5057             Dynarr_add (dst, '\n');
5058           ch = 0;
5059         }
5060       else if (BYTE_ASCII_P (c))
5061         {
5062           assert (ch == 0);
5063           Dynarr_add (dst, c);
5064         }
5065       else if (BUFBYTE_LEADING_BYTE_P (c))
5066         {
5067           assert (ch == 0);
5068           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5069               c == LEADING_BYTE_CONTROL_1)
5070             ch = c;
5071           else
5072             Dynarr_add (dst, '~'); /* untranslatable character */
5073         }
5074       else
5075         {
5076           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5077             Dynarr_add (dst, c);
5078           else if (ch == LEADING_BYTE_CONTROL_1)
5079             {
5080               assert (c < 0xC0);
5081               Dynarr_add (dst, c - 0x20);
5082             }
5083           /* else it should be the second or third byte of an
5084              untranslatable character, so ignore it */
5085           ch = 0;
5086         }
5087 #endif /* not UTF2000 */
5088     }
5089
5090   str->flags = flags;
5091   str->ch    = ch;
5092 #ifdef UTF2000
5093   str->iso2022.current_char_boundary = char_boundary;
5094 #endif
5095 }
5096
5097 \f
5098 /************************************************************************/
5099 /*                   Simple internal/external functions                 */
5100 /************************************************************************/
5101
5102 static Extbyte_dynarr *conversion_out_dynarr;
5103 static Bufbyte_dynarr *conversion_in_dynarr;
5104
5105 /* Determine coding system from coding format */
5106
5107 /* #### not correct for all values of `fmt'! */
5108 static Lisp_Object
5109 external_data_format_to_coding_system (enum external_data_format fmt)
5110 {
5111   switch (fmt)
5112     {
5113     case FORMAT_FILENAME:
5114     case FORMAT_TERMINAL:
5115       if (EQ (Vfile_name_coding_system, Qnil) ||
5116           EQ (Vfile_name_coding_system, Qbinary))
5117         return Qnil;
5118       else
5119         return Fget_coding_system (Vfile_name_coding_system);
5120 #ifdef MULE
5121     case FORMAT_CTEXT:
5122       return Fget_coding_system (Qctext);
5123 #endif
5124     default:
5125       return Qnil;
5126     }
5127 }
5128
5129 Extbyte *
5130 convert_to_external_format (CONST Bufbyte *ptr,
5131                             Bytecount len,
5132                             Extcount *len_out,
5133                             enum external_data_format fmt)
5134 {
5135   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5136
5137   if (!conversion_out_dynarr)
5138     conversion_out_dynarr = Dynarr_new (Extbyte);
5139   else
5140     Dynarr_reset (conversion_out_dynarr);
5141
5142   if (NILP (coding_system))
5143     {
5144       CONST Bufbyte *end = ptr + len;
5145
5146       for (; ptr < end;)
5147         {
5148 #ifdef UTF2000
5149           Bufbyte c =
5150             (*ptr < 0xc0) ? *ptr :
5151             ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5152 #else
5153           Bufbyte c =
5154             (BYTE_ASCII_P (*ptr))                  ? *ptr :
5155             (*ptr == LEADING_BYTE_CONTROL_1)       ? (*(ptr+1) - 0x20) :
5156             (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5157             '~';
5158 #endif
5159           Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5160           INC_CHARPTR (ptr);
5161         }
5162
5163 #ifdef ERROR_CHECK_BUFPOS
5164       assert (ptr == end);
5165 #endif
5166     }
5167   else
5168     {
5169       Lisp_Object instream, outstream, da_outstream;
5170       Lstream *istr, *ostr;
5171       struct gcpro gcpro1, gcpro2, gcpro3;
5172       char tempbuf[1024]; /* some random amount */
5173
5174       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5175       da_outstream = make_dynarr_output_stream
5176         ((unsigned_char_dynarr *) conversion_out_dynarr);
5177       outstream =
5178         make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5179       istr = XLSTREAM (instream);
5180       ostr = XLSTREAM (outstream);
5181       GCPRO3 (instream, outstream, da_outstream);
5182       while (1)
5183         {
5184           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5185           if (!size_in_bytes)
5186             break;
5187           Lstream_write (ostr, tempbuf, size_in_bytes);
5188         }
5189       Lstream_close (istr);
5190       Lstream_close (ostr);
5191       UNGCPRO;
5192       Lstream_delete (istr);
5193       Lstream_delete (ostr);
5194       Lstream_delete (XLSTREAM (da_outstream));
5195     }
5196
5197   *len_out = Dynarr_length (conversion_out_dynarr);
5198   Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5199   return Dynarr_atp (conversion_out_dynarr, 0);
5200 }
5201
5202 Bufbyte *
5203 convert_from_external_format (CONST Extbyte *ptr,
5204                               Extcount len,
5205                               Bytecount *len_out,
5206                               enum external_data_format fmt)
5207 {
5208   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5209
5210   if (!conversion_in_dynarr)
5211     conversion_in_dynarr = Dynarr_new (Bufbyte);
5212   else
5213     Dynarr_reset (conversion_in_dynarr);
5214
5215   if (NILP (coding_system))
5216     {
5217       CONST Extbyte *end = ptr + len;
5218       for (; ptr < end; ptr++)
5219         {
5220           Extbyte c = *ptr;
5221           DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5222         }
5223     }
5224   else
5225     {
5226       Lisp_Object instream, outstream, da_outstream;
5227       Lstream *istr, *ostr;
5228       struct gcpro gcpro1, gcpro2, gcpro3;
5229       char tempbuf[1024]; /* some random amount */
5230
5231       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5232       da_outstream = make_dynarr_output_stream
5233         ((unsigned_char_dynarr *) conversion_in_dynarr);
5234       outstream =
5235         make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5236       istr = XLSTREAM (instream);
5237       ostr = XLSTREAM (outstream);
5238       GCPRO3 (instream, outstream, da_outstream);
5239       while (1)
5240         {
5241           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5242           if (!size_in_bytes)
5243             break;
5244           Lstream_write (ostr, tempbuf, size_in_bytes);
5245         }
5246       Lstream_close (istr);
5247       Lstream_close (ostr);
5248       UNGCPRO;
5249       Lstream_delete (istr);
5250       Lstream_delete (ostr);
5251       Lstream_delete (XLSTREAM (da_outstream));
5252     }
5253
5254   *len_out = Dynarr_length (conversion_in_dynarr);
5255   Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5256   return Dynarr_atp (conversion_in_dynarr, 0);
5257 }
5258
5259 \f
5260 /************************************************************************/
5261 /*                             Initialization                           */
5262 /************************************************************************/
5263
5264 void
5265 syms_of_file_coding (void)
5266 {
5267   defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5268   deferror (&Qcoding_system_error, "coding-system-error",
5269             "Coding-system error", Qio_error);
5270
5271   DEFSUBR (Fcoding_system_p);
5272   DEFSUBR (Ffind_coding_system);
5273   DEFSUBR (Fget_coding_system);
5274   DEFSUBR (Fcoding_system_list);
5275   DEFSUBR (Fcoding_system_name);
5276   DEFSUBR (Fmake_coding_system);
5277   DEFSUBR (Fcopy_coding_system);
5278   DEFSUBR (Fdefine_coding_system_alias);
5279   DEFSUBR (Fsubsidiary_coding_system);
5280
5281   DEFSUBR (Fcoding_system_type);
5282   DEFSUBR (Fcoding_system_doc_string);
5283 #ifdef MULE
5284   DEFSUBR (Fcoding_system_charset);
5285 #endif
5286   DEFSUBR (Fcoding_system_property);
5287
5288   DEFSUBR (Fcoding_category_list);
5289   DEFSUBR (Fset_coding_priority_list);
5290   DEFSUBR (Fcoding_priority_list);
5291   DEFSUBR (Fset_coding_category_system);
5292   DEFSUBR (Fcoding_category_system);
5293
5294   DEFSUBR (Fdetect_coding_region);
5295   DEFSUBR (Fdecode_coding_region);
5296   DEFSUBR (Fencode_coding_region);
5297 #ifdef MULE
5298   DEFSUBR (Fdecode_shift_jis_char);
5299   DEFSUBR (Fencode_shift_jis_char);
5300   DEFSUBR (Fdecode_big5_char);
5301   DEFSUBR (Fencode_big5_char);
5302 #endif /* MULE */
5303   defsymbol (&Qcoding_system_p, "coding-system-p");
5304   defsymbol (&Qno_conversion, "no-conversion");
5305   defsymbol (&Qraw_text, "raw-text");
5306 #ifdef MULE
5307   defsymbol (&Qbig5, "big5");
5308   defsymbol (&Qshift_jis, "shift-jis");
5309   defsymbol (&Qucs4, "ucs-4");
5310   defsymbol (&Qutf8, "utf-8");
5311   defsymbol (&Qccl, "ccl");
5312   defsymbol (&Qiso2022, "iso2022");
5313 #endif /* MULE */
5314   defsymbol (&Qmnemonic, "mnemonic");
5315   defsymbol (&Qeol_type, "eol-type");
5316   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5317   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5318
5319   defsymbol (&Qcr, "cr");
5320   defsymbol (&Qlf, "lf");
5321   defsymbol (&Qcrlf, "crlf");
5322   defsymbol (&Qeol_cr, "eol-cr");
5323   defsymbol (&Qeol_lf, "eol-lf");
5324   defsymbol (&Qeol_crlf, "eol-crlf");
5325 #ifdef MULE
5326   defsymbol (&Qcharset_g0, "charset-g0");
5327   defsymbol (&Qcharset_g1, "charset-g1");
5328   defsymbol (&Qcharset_g2, "charset-g2");
5329   defsymbol (&Qcharset_g3, "charset-g3");
5330   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5331   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5332   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5333   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5334   defsymbol (&Qno_iso6429, "no-iso6429");
5335   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5336   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5337
5338   defsymbol (&Qshort, "short");
5339   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5340   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5341   defsymbol (&Qseven, "seven");
5342   defsymbol (&Qlock_shift, "lock-shift");
5343   defsymbol (&Qescape_quoted, "escape-quoted");
5344 #endif /* MULE */
5345   defsymbol (&Qencode, "encode");
5346   defsymbol (&Qdecode, "decode");
5347
5348 #ifdef MULE
5349   defsymbol (&Qctext, "ctext");
5350   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5351              "shift-jis");
5352   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5353              "big5");
5354   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5355              "ucs-4");
5356   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5357              "utf-8");
5358   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5359              "iso-7");
5360   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5361              "iso-8-designate");
5362   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5363              "iso-8-1");
5364   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5365              "iso-8-2");
5366   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5367              "iso-lock-shift");
5368 #endif /* MULE */
5369   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5370              "no-conversion");
5371 }
5372
5373 void
5374 lstream_type_create_file_coding (void)
5375 {
5376   LSTREAM_HAS_METHOD (decoding, reader);
5377   LSTREAM_HAS_METHOD (decoding, writer);
5378   LSTREAM_HAS_METHOD (decoding, rewinder);
5379   LSTREAM_HAS_METHOD (decoding, seekable_p);
5380   LSTREAM_HAS_METHOD (decoding, flusher);
5381   LSTREAM_HAS_METHOD (decoding, closer);
5382   LSTREAM_HAS_METHOD (decoding, marker);
5383
5384   LSTREAM_HAS_METHOD (encoding, reader);
5385   LSTREAM_HAS_METHOD (encoding, writer);
5386   LSTREAM_HAS_METHOD (encoding, rewinder);
5387   LSTREAM_HAS_METHOD (encoding, seekable_p);
5388   LSTREAM_HAS_METHOD (encoding, flusher);
5389   LSTREAM_HAS_METHOD (encoding, closer);
5390   LSTREAM_HAS_METHOD (encoding, marker);
5391 }
5392
5393 void
5394 vars_of_file_coding (void)
5395 {
5396   int i;
5397
5398   /* Initialize to something reasonable ... */
5399   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5400     {
5401       coding_category_system[i] = Qnil;
5402       coding_category_by_priority[i] = i;
5403     }
5404
5405   Fprovide (intern ("file-coding"));
5406
5407   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5408 Coding system used for TTY keyboard input.
5409 Not used under a windowing system.
5410 */ );
5411   Vkeyboard_coding_system = Qnil;
5412
5413   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5414 Coding system used for TTY display output.
5415 Not used under a windowing system.
5416 */ );
5417   Vterminal_coding_system = Qnil;
5418
5419   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5420 Overriding coding system used when writing a file or process.
5421 You should *bind* this, not set it.  If this is non-nil, it specifies
5422 the coding system that will be used when a file or process is read
5423 in, and overrides `buffer-file-coding-system-for-read',
5424 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5425 this one for permanent changes to the environment.
5426 */ );
5427   Vcoding_system_for_read = Qnil;
5428
5429   DEFVAR_LISP ("coding-system-for-write",
5430                &Vcoding_system_for_write /*
5431 Overriding coding system used when writing a file or process.
5432 You should *bind* this, not set it.  If this is non-nil, it specifies
5433 the coding system that will be used when a file or process is wrote
5434 in, and overrides `buffer-file-coding-system',
5435 `write-region-pre-hook', etc.  Use those variables instead of this one
5436 for permanent changes to the environment.
5437 */ );
5438   Vcoding_system_for_write = Qnil;
5439
5440   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5441 Coding system used to convert pathnames when accessing files.
5442 */ );
5443   Vfile_name_coding_system = Qnil;
5444
5445   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5446 Non-nil means the buffer contents are regarded as multi-byte form
5447 of characters, not a binary code.  This affects the display, file I/O,
5448 and behaviors of various editing commands.
5449
5450 Setting this to nil does not do anything.
5451 */ );
5452   enable_multibyte_characters = 1;
5453 }
5454
5455 void
5456 complex_vars_of_file_coding (void)
5457 {
5458   staticpro (&Vcoding_system_hash_table);
5459   Vcoding_system_hash_table =
5460     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5461
5462   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5463
5464 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5465 {                                               \
5466   struct codesys_prop csp;                      \
5467   csp.sym = (Sym);                              \
5468   csp.prop_type = (Prop_Type);                  \
5469   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5470 } while (0)
5471
5472   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5473   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5474   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5475   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5476   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5477   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5478   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5479 #ifdef MULE
5480   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5481   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5482   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5483   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5484   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5485   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5486   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5487   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5488   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5489   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5490   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5491   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5492   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5493   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5494   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5495   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5496   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5497
5498   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5499   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5500 #endif /* MULE */
5501   /* Need to create this here or we're really screwed. */
5502   Fmake_coding_system
5503     (Qraw_text, Qno_conversion,
5504      build_string ("Raw text, which means it converts only line-break-codes."),
5505      list2 (Qmnemonic, build_string ("Raw")));
5506
5507   Fmake_coding_system
5508     (Qbinary, Qno_conversion,
5509      build_string ("Binary, which means it does not convert anything."),
5510      list4 (Qeol_type, Qlf,
5511             Qmnemonic, build_string ("Binary")));
5512
5513 #ifdef UTF2000
5514   Fmake_coding_system
5515     (Qutf8, Qutf8,
5516      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5517      list2 (Qmnemonic, build_string ("UTF8")));
5518 #endif
5519
5520   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5521
5522   /* Need this for bootstrapping */
5523   coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5524     Fget_coding_system (Qraw_text);
5525
5526 #ifdef UTF2000
5527   coding_category_system[CODING_CATEGORY_UTF8]
5528    = Fget_coding_system (Qutf8);
5529 #endif
5530 }