git.chise.org Git - chise/xemacs-chise.git-/blob - src/text-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28
  29 #include "buffer.h"
  30 #include "elhash.h"
  31 #include "insdel.h"
  32 #include "lstream.h"
  33 #ifdef MULE
  34 #include "mule-ccl.h"
  35 #include "chartab.h"
  36 #endif
  37 #include "file-coding.h"
  38
  39 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
  40
  41 Lisp_Object Vkeyboard_coding_system;
  42 Lisp_Object Vterminal_coding_system;
  43 Lisp_Object Vcoding_system_for_read;
  44 Lisp_Object Vcoding_system_for_write;
  45 Lisp_Object Vfile_name_coding_system;
  46
  47 /* Table of symbols identifying each coding category. */
  48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  49
  50 /* Coding system currently associated with each coding category. */
  51 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  52
  53 /* Table of all coding categories in decreasing order of priority.
  54    This describes a permutation of the possible coding categories. */
  55 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  56
  57 Lisp_Object Qcoding_system_p;
  58
  59 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  60 /* Qinternal in general.c */
  61
  62 Lisp_Object Qmnemonic, Qeol_type;
  63 Lisp_Object Qcr, Qcrlf, Qlf;
  64 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  65 Lisp_Object Qpost_read_conversion;
  66 Lisp_Object Qpre_write_conversion;
  67
  68 #ifdef MULE
  69 Lisp_Object Qucs4, Qutf8;
  70 Lisp_Object Qbig5, Qshift_jis;
  71 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  72 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  73 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  74 Lisp_Object Qno_iso6429;
  75 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
  76 Lisp_Object Qctext, Qescape_quoted;
  77 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
  78 #endif
  79 Lisp_Object Qencode, Qdecode;
  80
  81 Lisp_Object Vcoding_system_hash_table;
  82
  83 int enable_multibyte_characters;
  84
  85 #ifdef MULE
  86 /* Additional information used by the ISO2022 decoder and detector. */
  87 struct iso2022_decoder
  88 {
  89   /* CHARSET holds the character sets currently assigned to the G0
  90      through G3 variables.  It is initialized from the array
  91      INITIAL_CHARSET in CODESYS. */
  92   Lisp_Object charset[4];
  93
  94   /* Which registers are currently invoked into the left (GL) and
  95      right (GR) halves of the 8-bit encoding space? */
  96   int register_left, register_right;
  97
  98   /* ISO_ESC holds a value indicating part of an escape sequence
  99      that has already been seen. */
 100   enum iso_esc_flag esc;
 101
 102   /* This records the bytes we've seen so far in an escape sequence,
 103      in case the sequence is invalid (we spit out the bytes unchanged). */
 104   unsigned char esc_bytes[8];
 105
 106   /* Index for next byte to store in ISO escape sequence. */
 107   int esc_bytes_index;
 108
 109 #ifdef ENABLE_COMPOSITE_CHARS
 110   /* Stuff seen so far when composing a string. */
 111   unsigned_char_dynarr *composite_chars;
 112 #endif
 113
 114   /* If we saw an invalid designation sequence for a particular
 115      register, we flag it here and switch to ASCII.  The next time we
 116      see a valid designation for this register, we turn off the flag
 117      and do the designation normally, but pretend the sequence was
 118      invalid.  The effect of all this is that (most of the time) the
 119      escape sequences for both the switch to the unknown charset, and
 120      the switch back to the known charset, get inserted literally into
 121      the buffer and saved out as such.  The hope is that we can
 122      preserve the escape sequences so that the resulting written out
 123      file makes sense.  If we don't do any of this, the designation
 124      to the invalid charset will be preserved but that switch back
 125      to the known charset will probably get eaten because it was
 126      the same charset that was already present in the register. */
 127   unsigned char invalid_designated[4];
 128
 129   /* We try to do similar things as above for direction-switching
 130      sequences.  If we encountered a direction switch while an
 131      invalid designation was present, or an invalid designation
 132      just after a direction switch (i.e. no valid designation
 133      encountered yet), we insert the direction-switch escape
 134      sequence literally into the output stream, and later on
 135      insert the corresponding direction-restoring escape sequence
 136      literally also. */
 137   unsigned int switched_dir_and_no_valid_charset_yet :1;
 138   unsigned int invalid_switch_dir :1;
 139
 140   /* Tells the decoder to output the escape sequence literally
 141      even though it was valid.  Used in the games we play to
 142      avoid lossage when we encounter invalid designations. */
 143   unsigned int output_literally :1;
 144   /* We encountered a direction switch followed by an invalid
 145      designation.  We didn't output the direction switch
 146      literally because we didn't know about the invalid designation;
 147      but we have to do so now. */
 148   unsigned int output_direction_sequence :1;
 149 };
 150 #endif /* MULE */
 151 EXFUN (Fcopy_coding_system, 2);
 152 #ifdef MULE
 153 struct detection_state;
 154
 155 static void
 156 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
 157                      unsigned_char_dynarr *dst, unsigned int n);
 158
 159 static int detect_coding_sjis (struct detection_state *st,
 160                                CONST unsigned char *src,
 161                                unsigned int n);
 162 static void decode_coding_sjis (Lstream *decoding,
 163                                 CONST unsigned char *src,
 164                                 unsigned_char_dynarr *dst,
 165                                 unsigned int n);
 166 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
 167                             unsigned_char_dynarr *dst, unsigned int *flags);
 168 void char_finish_shift_jis (struct encoding_stream *str,
 169                             unsigned_char_dynarr *dst, unsigned int *flags);
 170
 171 static int detect_coding_big5 (struct detection_state *st,
 172                                CONST unsigned char *src,
 173                                unsigned int n);
 174 static void decode_coding_big5 (Lstream *decoding,
 175                                 CONST unsigned char *src,
 176                                 unsigned_char_dynarr *dst, unsigned int n);
 177 static void encode_coding_big5 (Lstream *encoding,
 178                                 CONST unsigned char *src,
 179                                 unsigned_char_dynarr *dst, unsigned int n);
 180 static int detect_coding_ucs4 (struct detection_state *st,
 181                                CONST unsigned char *src,
 182                                unsigned int n);
 183 static void decode_coding_ucs4 (Lstream *decoding,
 184                                 CONST unsigned char *src,
 185                                 unsigned_char_dynarr *dst, unsigned int n);
 186 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
 187                        unsigned_char_dynarr *dst, unsigned int *flags);
 188 void char_finish_ucs4 (struct encoding_stream *str,
 189                        unsigned_char_dynarr *dst, unsigned int *flags);
 190
 191 static int detect_coding_utf8 (struct detection_state *st,
 192                                CONST unsigned char *src,
 193                                unsigned int n);
 194 static void decode_coding_utf8 (Lstream *decoding,
 195                                 CONST unsigned char *src,
 196                                 unsigned_char_dynarr *dst, unsigned int n);
 197 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
 198                        unsigned_char_dynarr *dst, unsigned int *flags);
 199 void char_finish_utf8 (struct encoding_stream *str,
 200                        unsigned_char_dynarr *dst, unsigned int *flags);
 201
 202 static int postprocess_iso2022_mask (int mask);
 203 static void reset_iso2022 (Lisp_Object coding_system,
 204                            struct iso2022_decoder *iso);
 205 static int detect_coding_iso2022 (struct detection_state *st,
 206                                   CONST unsigned char *src,
 207                                   unsigned int n);
 208 static void decode_coding_iso2022 (Lstream *decoding,
 209                                    CONST unsigned char *src,
 210                                    unsigned_char_dynarr *dst, unsigned int n);
 211 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
 212                           unsigned_char_dynarr *dst, unsigned int *flags);
 213 void char_finish_iso2022 (struct encoding_stream *str,
 214                           unsigned_char_dynarr *dst, unsigned int *flags);
 215 #endif /* MULE */
 216 static void decode_coding_no_conversion (Lstream *decoding,
 217                                          CONST unsigned char *src,
 218                                          unsigned_char_dynarr *dst,
 219                                          unsigned int n);
 220 static void encode_coding_no_conversion (Lstream *encoding,
 221                                          CONST unsigned char *src,
 222                                          unsigned_char_dynarr *dst,
 223                                          unsigned int n);
 224 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
 225                          unsigned_char_dynarr *dst, unsigned int n);
 226 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
 227                          unsigned_char_dynarr *dst, unsigned int n);
 228
 229 typedef struct codesys_prop codesys_prop;
 230 struct codesys_prop
 231 {
 232   Lisp_Object sym;
 233   int prop_type;
 234 };
 235
 236 typedef struct
 237 {
 238   Dynarr_declare (codesys_prop);
 239 } codesys_prop_dynarr;
 240
 241 codesys_prop_dynarr *the_codesys_prop_dynarr;
 242
 243 enum codesys_prop_enum
 244 {
 245   CODESYS_PROP_ALL_OK,
 246   CODESYS_PROP_ISO2022,
 247   CODESYS_PROP_CCL
 248 };
 249
 250 \f
 251 /************************************************************************/
 252 /*                       Coding system functions                        */
 253 /************************************************************************/
 254
 255 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
 256 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 257 static void finalize_coding_system (void *header, int for_disksave);
 258
 259 #ifdef MULE
 260 static const struct lrecord_description ccs_description_1[] = {
 261   { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
 262   { XD_END }
 263 };
 264
 265 static const struct struct_description ccs_description = {
 266   sizeof(charset_conversion_spec),
 267   ccs_description_1
 268 };
 269
 270 static const struct lrecord_description ccsd_description_1[] = {
 271   XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
 272   { XD_END }
 273 };
 274
 275 static const struct struct_description ccsd_description = {
 276   sizeof(charset_conversion_spec_dynarr),
 277   ccsd_description_1
 278 };
 279 #endif
 280
 281 static const struct lrecord_description coding_system_description[] = {
 282   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
 283   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
 284   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
 285 #ifdef MULE
 286   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
 287   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 288   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 289   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
 290 #endif
 291   { XD_END }
 292 };
 293
 294 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 295                                mark_coding_system, print_coding_system,
 296                                finalize_coding_system,
 297                                0, 0, coding_system_description,
 298                                struct Lisp_Coding_System);
 299
 300 static Lisp_Object
 301 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
 302 {
 303   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 304
 305   markobj (CODING_SYSTEM_NAME (codesys));
 306   markobj (CODING_SYSTEM_DOC_STRING (codesys));
 307   markobj (CODING_SYSTEM_MNEMONIC (codesys));
 308   markobj (CODING_SYSTEM_EOL_LF (codesys));
 309   markobj (CODING_SYSTEM_EOL_CRLF (codesys));
 310   markobj (CODING_SYSTEM_EOL_CR (codesys));
 311
 312   switch (CODING_SYSTEM_TYPE (codesys))
 313     {
 314 #ifdef MULE
 315       int i;
 316     case CODESYS_ISO2022:
 317       for (i = 0; i < 4; i++)
 318         markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 319       if (codesys->iso2022.input_conv)
 320         {
 321           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 322             {
 323               struct charset_conversion_spec *ccs =
 324                 Dynarr_atp (codesys->iso2022.input_conv, i);
 325               markobj (ccs->from_charset);
 326               markobj (ccs->to_charset);
 327             }
 328         }
 329       if (codesys->iso2022.output_conv)
 330         {
 331           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 332             {
 333               struct charset_conversion_spec *ccs =
 334                 Dynarr_atp (codesys->iso2022.output_conv, i);
 335               markobj (ccs->from_charset);
 336               markobj (ccs->to_charset);
 337             }
 338         }
 339       break;
 340
 341     case CODESYS_CCL:
 342       markobj (CODING_SYSTEM_CCL_DECODE (codesys));
 343       markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
 344       break;
 345 #endif /* MULE */
 346     default:
 347       break;
 348     }
 349
 350   markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 351   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 352 }
 353
 354 static void
 355 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 356                      int escapeflag)
 357 {
 358   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 359   if (print_readably)
 360     error ("printing unreadable object #<coding_system 0x%x>",
 361            c->header.uid);
 362
 363   write_c_string ("#<coding_system ", printcharfun);
 364   print_internal (c->name, printcharfun, 1);
 365   write_c_string (">", printcharfun);
 366 }
 367
 368 static void
 369 finalize_coding_system (void *header, int for_disksave)
 370 {
 371   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 372   /* Since coding systems never go away, this function is not
 373      necessary.  But it would be necessary if we changed things
 374      so that coding systems could go away. */
 375   if (!for_disksave) /* see comment in lstream.c */
 376     {
 377       switch (CODING_SYSTEM_TYPE (c))
 378         {
 379 #ifdef MULE
 380         case CODESYS_ISO2022:
 381           if (c->iso2022.input_conv)
 382             {
 383               Dynarr_free (c->iso2022.input_conv);
 384               c->iso2022.input_conv = 0;
 385             }
 386           if (c->iso2022.output_conv)
 387             {
 388               Dynarr_free (c->iso2022.output_conv);
 389               c->iso2022.output_conv = 0;
 390             }
 391           break;
 392 #endif /* MULE */
 393         default:
 394           break;
 395         }
 396     }
 397 }
 398
 399 static enum eol_type
 400 symbol_to_eol_type (Lisp_Object symbol)
 401 {
 402   CHECK_SYMBOL (symbol);
 403   if (NILP (symbol))      return EOL_AUTODETECT;
 404   if (EQ (symbol, Qlf))   return EOL_LF;
 405   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 406   if (EQ (symbol, Qcr))   return EOL_CR;
 407
 408   signal_simple_error ("Unrecognized eol type", symbol);
 409   return EOL_AUTODETECT; /* not reached */
 410 }
 411
 412 static Lisp_Object
 413 eol_type_to_symbol (enum eol_type type)
 414 {
 415   switch (type)
 416     {
 417     default: abort ();
 418     case EOL_LF:         return Qlf;
 419     case EOL_CRLF:       return Qcrlf;
 420     case EOL_CR:         return Qcr;
 421     case EOL_AUTODETECT: return Qnil;
 422     }
 423 }
 424
 425 static void
 426 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 427 {
 428   Lisp_Object codesys_obj;
 429   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 430   char *codesys_name = (char *) alloca (len + 7);
 431   int mlen = -1;
 432   char *codesys_mnemonic=0;
 433
 434   Lisp_Object codesys_name_sym, sub_codesys_obj;
 435
 436   /* kludge */
 437
 438   XSETCODING_SYSTEM (codesys_obj, codesys);
 439
 440   memcpy (codesys_name,
 441           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 442
 443   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 444     {
 445       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 446       codesys_mnemonic = (char *) alloca (mlen + 7);
 447       memcpy (codesys_mnemonic,
 448               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 449     }
 450
 451 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 452   strcpy (codesys_name + len, "-" op_sys);                                      \
 453   if (mlen != -1)                                                               \
 454     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 455   codesys_name_sym = intern (codesys_name);                                     \
 456   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 457   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 458   if (mlen != -1)                                                               \
 459     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 460       build_string (codesys_mnemonic);                                          \
 461   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 462 } while (0)
 463
 464   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 465   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 466   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 467 }
 468
 469 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 470 Return t if OBJECT is a coding system.
 471 A coding system is an object that defines how text containing multiple
 472 character sets is encoded into a stream of (typically 8-bit) bytes.
 473 The coding system is used to decode the stream into a series of
 474 characters (which may be from multiple charsets) when the text is read
 475 from a file or process, and is used to encode the text back into the
 476 same format when it is written out to a file or process.
 477
 478 For example, many ISO2022-compliant coding systems (such as Compound
 479 Text, which is used for inter-client data under the X Window System)
 480 use escape sequences to switch between different charsets -- Japanese
 481 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 482 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 483 `make-coding-system' for more information.
 484
 485 Coding systems are normally identified using a symbol, and the
 486 symbol is accepted in place of the actual coding system object whenever
 487 a coding system is called for. (This is similar to how faces work.)
 488 */
 489        (object))
 490 {
 491   return CODING_SYSTEMP (object) ? Qt : Qnil;
 492 }
 493
 494 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 495 Retrieve the coding system of the given name.
 496
 497 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 498 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 499 If there is no such coding system, nil is returned.  Otherwise the
 500 associated coding system object is returned.
 501 */
 502        (coding_system_or_name))
 503 {
 504   if (CODING_SYSTEMP (coding_system_or_name))
 505     return coding_system_or_name;
 506
 507   if (NILP (coding_system_or_name))
 508     coding_system_or_name = Qbinary;
 509   else
 510     CHECK_SYMBOL (coding_system_or_name);
 511
 512   return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 513 }
 514
 515 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 516 Retrieve the coding system of the given name.
 517 Same as `find-coding-system' except that if there is no such
 518 coding system, an error is signaled instead of returning nil.
 519 */
 520        (name))
 521 {
 522   Lisp_Object coding_system = Ffind_coding_system (name);
 523
 524   if (NILP (coding_system))
 525     signal_simple_error ("No such coding system", name);
 526   return coding_system;
 527 }
 528
 529 /* We store the coding systems in hash tables with the names as the key and the
 530    actual coding system object as the value.  Occasionally we need to use them
 531    in a list format.  These routines provide us with that. */
 532 struct coding_system_list_closure
 533 {
 534   Lisp_Object *coding_system_list;
 535 };
 536
 537 static int
 538 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 539                                   void *coding_system_list_closure)
 540 {
 541   /* This function can GC */
 542   struct coding_system_list_closure *cscl =
 543     (struct coding_system_list_closure *) coding_system_list_closure;
 544   Lisp_Object *coding_system_list = cscl->coding_system_list;
 545
 546   *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
 547                                *coding_system_list);
 548   return 0;
 549 }
 550
 551 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 552 Return a list of the names of all defined coding systems.
 553 */
 554        ())
 555 {
 556   Lisp_Object coding_system_list = Qnil;
 557   struct gcpro gcpro1;
 558   struct coding_system_list_closure coding_system_list_closure;
 559
 560   GCPRO1 (coding_system_list);
 561   coding_system_list_closure.coding_system_list = &coding_system_list;
 562   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 563                  &coding_system_list_closure);
 564   UNGCPRO;
 565
 566   return coding_system_list;
 567 }
 568
 569 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 570 Return the name of the given coding system.
 571 */
 572        (coding_system))
 573 {
 574   coding_system = Fget_coding_system (coding_system);
 575   return XCODING_SYSTEM_NAME (coding_system);
 576 }
 577
 578 static Lisp_Coding_System *
 579 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 580 {
 581   Lisp_Coding_System *codesys =
 582     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 583
 584   zero_lcrecord (codesys);
 585   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 586   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 587   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 588   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 589   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 590   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 591   CODING_SYSTEM_TYPE     (codesys) = type;
 592   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 593 #ifdef MULE
 594   if (type == CODESYS_ISO2022)
 595     {
 596       int i;
 597       for (i = 0; i < 4; i++)
 598         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 599     }
 600   else if (type == CODESYS_CCL)
 601     {
 602       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 603       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 604     }
 605 #endif /* MULE */
 606   CODING_SYSTEM_NAME (codesys) = name;
 607
 608   return codesys;
 609 }
 610
 611 #ifdef MULE
 612 /* Given a list of charset conversion specs as specified in a Lisp
 613    program, parse it into STORE_HERE. */
 614
 615 static void
 616 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 617                                 Lisp_Object spec_list)
 618 {
 619   Lisp_Object rest;
 620
 621   EXTERNAL_LIST_LOOP (rest, spec_list)
 622     {
 623       Lisp_Object car = XCAR (rest);
 624       Lisp_Object from, to;
 625       struct charset_conversion_spec spec;
 626
 627       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 628         signal_simple_error ("Invalid charset conversion spec", car);
 629       from = Fget_charset (XCAR (car));
 630       to = Fget_charset (XCAR (XCDR (car)));
 631       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 632         signal_simple_error_2
 633           ("Attempted conversion between different charset types",
 634            from, to);
 635       spec.from_charset = from;
 636       spec.to_charset = to;
 637
 638       Dynarr_add (store_here, spec);
 639     }
 640 }
 641
 642 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 643    specs, return the equivalent as the Lisp programmer would see it.
 644
 645    If LOAD_HERE is 0, return Qnil. */
 646
 647 static Lisp_Object
 648 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 649 {
 650   int i;
 651   Lisp_Object result;
 652
 653   if (!load_here)
 654     return Qnil;
 655   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 656     {
 657       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 658       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 659     }
 660
 661   return Fnreverse (result);
 662 }
 663
 664 #endif /* MULE */
 665
 666 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 667 Register symbol NAME as a coding system.
 668
 669 TYPE describes the conversion method used and should be one of
 670
 671 nil or 'undecided
 672      Automatic conversion.  XEmacs attempts to detect the coding system
 673      used in the file.
 674 'no-conversion
 675      No conversion.  Use this for binary files and such.  On output,
 676      graphic characters that are not in ASCII or Latin-1 will be
 677      replaced by a ?. (For a no-conversion-encoded buffer, these
 678      characters will only be present if you explicitly insert them.)
 679 'shift-jis
 680      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 681 'ucs-4
 682      ISO 10646 UCS-4 encoding.
 683 'utf-8
 684      ISO 10646 UTF-8 encoding.
 685 'iso2022
 686      Any ISO2022-compliant encoding.  Among other things, this includes
 687      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 688      standard Unix encoding for Japanese and other languages), and
 689      Compound Text (the encoding used in X11).  You can specify more
 690      specific information about the conversion with the FLAGS argument.
 691 'big5
 692      Big5 (the encoding commonly used for Taiwanese).
 693 'ccl
 694      The conversion is performed using a user-written pseudo-code
 695      program.  CCL (Code Conversion Language) is the name of this
 696      pseudo-code.
 697 'internal
 698      Write out or read in the raw contents of the memory representing
 699      the buffer's text.  This is primarily useful for debugging
 700      purposes, and is only enabled when XEmacs has been compiled with
 701      DEBUG_XEMACS defined (via the --debug configure option).
 702      WARNING: Reading in a file using 'internal conversion can result
 703      in an internal inconsistency in the memory representing a
 704      buffer's text, which will produce unpredictable results and may
 705      cause XEmacs to crash.  Under normal circumstances you should
 706      never use 'internal conversion.
 707
 708 DOC-STRING is a string describing the coding system.
 709
 710 PROPS is a property list, describing the specific nature of the
 711 character set.  Recognized properties are:
 712
 713 'mnemonic
 714      String to be displayed in the modeline when this coding system is
 715      active.
 716
 717 'eol-type
 718      End-of-line conversion to be used.  It should be one of
 719
 720         nil
 721                 Automatically detect the end-of-line type (LF, CRLF,
 722                 or CR).  Also generate subsidiary coding systems named
 723                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 724                 identical to this coding system but have an EOL-TYPE
 725                 value of 'lf, 'crlf, and 'cr, respectively.
 726         'lf
 727                 The end of a line is marked externally using ASCII LF.
 728                 Since this is also the way that XEmacs represents an
 729                 end-of-line internally, specifying this option results
 730                 in no end-of-line conversion.  This is the standard
 731                 format for Unix text files.
 732         'crlf
 733                 The end of a line is marked externally using ASCII
 734                 CRLF.  This is the standard format for MS-DOS text
 735                 files.
 736         'cr
 737                 The end of a line is marked externally using ASCII CR.
 738                 This is the standard format for Macintosh text files.
 739         t
 740                 Automatically detect the end-of-line type but do not
 741                 generate subsidiary coding systems.  (This value is
 742                 converted to nil when stored internally, and
 743                 `coding-system-property' will return nil.)
 744
 745 'post-read-conversion
 746      Function called after a file has been read in, to perform the
 747      decoding.  Called with two arguments, BEG and END, denoting
 748      a region of the current buffer to be decoded.
 749
 750 'pre-write-conversion
 751      Function called before a file is written out, to perform the
 752      encoding.  Called with two arguments, BEG and END, denoting
 753      a region of the current buffer to be encoded.
 754
 755
 756 The following additional properties are recognized if TYPE is 'iso2022:
 757
 758 'charset-g0
 759 'charset-g1
 760 'charset-g2
 761 'charset-g3
 762      The character set initially designated to the G0 - G3 registers.
 763      The value should be one of
 764
 765           -- A charset object (designate that character set)
 766           -- nil (do not ever use this register)
 767           -- t (no character set is initially designated to
 768                 the register, but may be later on; this automatically
 769                 sets the corresponding `force-g*-on-output' property)
 770
 771 'force-g0-on-output
 772 'force-g1-on-output
 773 'force-g2-on-output
 774 'force-g2-on-output
 775      If non-nil, send an explicit designation sequence on output before
 776      using the specified register.
 777
 778 'short
 779      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 780      "ESC $ B" on output in place of the full designation sequences
 781      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 782
 783 'no-ascii-eol
 784      If non-nil, don't designate ASCII to G0 at each end of line on output.
 785      Setting this to non-nil also suppresses other state-resetting that
 786      normally happens at the end of a line.
 787
 788 'no-ascii-cntl
 789      If non-nil, don't designate ASCII to G0 before control chars on output.
 790
 791 'seven
 792      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 793      environment.
 794
 795 'lock-shift
 796      If non-nil, use locking-shift (SO/SI) instead of single-shift
 797      or designation by escape sequence.
 798
 799 'no-iso6429
 800      If non-nil, don't use ISO6429's direction specification.
 801
 802 'escape-quoted
 803      If non-nil, literal control characters that are the same as
 804      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 805      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 806      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 807      so that they can be properly distinguished from an escape sequence.
 808      (Note that doing this results in a non-portable encoding.) This
 809      encoding flag is used for byte-compiled files.  Note that ESC
 810      is a good choice for a quoting character because there are no
 811      escape sequences whose second byte is a character from the Control-0
 812      or Control-1 character sets; this is explicitly disallowed by the
 813      ISO2022 standard.
 814
 815 'input-charset-conversion
 816      A list of conversion specifications, specifying conversion of
 817      characters in one charset to another when decoding is performed.
 818      Each specification is a list of two elements: the source charset,
 819      and the destination charset.
 820
 821 'output-charset-conversion
 822      A list of conversion specifications, specifying conversion of
 823      characters in one charset to another when encoding is performed.
 824      The form of each specification is the same as for
 825      'input-charset-conversion.
 826
 827
 828 The following additional properties are recognized (and required)
 829 if TYPE is 'ccl:
 830
 831 'decode
 832      CCL program used for decoding (converting to internal format).
 833
 834 'encode
 835      CCL program used for encoding (converting to external format).
 836 */
 837        (name, type, doc_string, props))
 838 {
 839   Lisp_Coding_System *codesys;
 840   Lisp_Object rest, key, value;
 841   enum coding_system_type ty;
 842   int need_to_setup_eol_systems = 1;
 843
 844   /* Convert type to constant */
 845   if (NILP (type) || EQ (type, Qundecided))
 846                                       { ty = CODESYS_AUTODETECT; }
 847 #ifdef MULE
 848   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 849   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 850   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 851   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 852   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 853   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 854 #endif
 855   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 856 #ifdef DEBUG_XEMACS
 857   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 858 #endif
 859   else
 860     signal_simple_error ("Invalid coding system type", type);
 861
 862   CHECK_SYMBOL (name);
 863
 864   codesys = allocate_coding_system (ty, name);
 865
 866   if (NILP (doc_string))
 867     doc_string = build_string ("");
 868   else
 869     CHECK_STRING (doc_string);
 870   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 871
 872 #ifdef UTF2000
 873   if (ty == CODESYS_NO_CONVERSION)
 874     codesys->fixed.size = 1;
 875 #endif
 876   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 877     {
 878       if (EQ (key, Qmnemonic))
 879         {
 880           if (!NILP (value))
 881             CHECK_STRING (value);
 882           CODING_SYSTEM_MNEMONIC (codesys) = value;
 883         }
 884
 885       else if (EQ (key, Qeol_type))
 886         {
 887           need_to_setup_eol_systems = NILP (value);
 888           if (EQ (value, Qt))
 889             value = Qnil;
 890           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 891         }
 892
 893       else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 894       else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 895 #ifdef MULE
 896       else if (ty == CODESYS_ISO2022)
 897         {
 898 #define FROB_INITIAL_CHARSET(charset_num) \
 899   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 900     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 901
 902           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 903           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 904           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 905           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 906
 907 #define FROB_FORCE_CHARSET(charset_num) \
 908   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 909
 910           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 911           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 912           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 913           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 914
 915 #define FROB_BOOLEAN_PROPERTY(prop) \
 916   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 917
 918           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 919           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 920           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 921           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 922           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 923           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 924           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 925
 926           else if (EQ (key, Qinput_charset_conversion))
 927             {
 928               codesys->iso2022.input_conv =
 929                 Dynarr_new (charset_conversion_spec);
 930               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 931                                               value);
 932             }
 933           else if (EQ (key, Qoutput_charset_conversion))
 934             {
 935               codesys->iso2022.output_conv =
 936                 Dynarr_new (charset_conversion_spec);
 937               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 938                                               value);
 939             }
 940           else
 941             signal_simple_error ("Unrecognized property", key);
 942         }
 943       else if (EQ (type, Qccl))
 944         {
 945           if (EQ (key, Qdecode))
 946             {
 947               CHECK_VECTOR (value);
 948               CODING_SYSTEM_CCL_DECODE (codesys) = value;
 949             }
 950           else if (EQ (key, Qencode))
 951             {
 952               CHECK_VECTOR (value);
 953               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
 954             }
 955           else
 956             signal_simple_error ("Unrecognized property", key);
 957         }
 958 #endif /* MULE */
 959       else
 960         signal_simple_error ("Unrecognized property", key);
 961     }
 962
 963   if (need_to_setup_eol_systems)
 964     setup_eol_coding_systems (codesys);
 965
 966   {
 967     Lisp_Object codesys_obj;
 968     XSETCODING_SYSTEM (codesys_obj, codesys);
 969     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
 970     return codesys_obj;
 971   }
 972 }
 973
 974 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
 975 Copy OLD-CODING-SYSTEM to NEW-NAME.
 976 If NEW-NAME does not name an existing coding system, a new one will
 977 be created.
 978 */
 979        (old_coding_system, new_name))
 980 {
 981   Lisp_Object new_coding_system;
 982   old_coding_system = Fget_coding_system (old_coding_system);
 983   new_coding_system = Ffind_coding_system (new_name);
 984   if (NILP (new_coding_system))
 985     {
 986       XSETCODING_SYSTEM (new_coding_system,
 987                          allocate_coding_system
 988                          (XCODING_SYSTEM_TYPE (old_coding_system),
 989                           new_name));
 990       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
 991     }
 992
 993   {
 994     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
 995     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
 996     memcpy (((char *) to  ) + sizeof (to->header),
 997             ((char *) from) + sizeof (from->header),
 998             sizeof (*from) - sizeof (from->header));
 999     to->name = new_name;
1000   }
1001   return new_coding_system;
1002 }
1003
1004 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1005 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1006 */
1007        (alias, coding_system))
1008 {
1009   CHECK_SYMBOL (alias);
1010   if (!NILP (Ffind_coding_system (alias)))
1011     signal_simple_error ("Symbol already names a coding system", alias);
1012   coding_system = Fget_coding_system (coding_system);
1013   Fputhash (alias, coding_system, Vcoding_system_hash_table);
1014
1015   /* Set up aliases for subsidiaries. */
1016   if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1017     {
1018       Lisp_Object str;
1019       XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1020 #define FROB(type, name)                                                        \
1021       do {                                                                      \
1022         Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system);     \
1023         if (!NILP (subsidiary))                                                 \
1024           Fdefine_coding_system_alias                                           \
1025             (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary);   \
1026       } while (0)
1027       FROB (LF,   "-unix");
1028       FROB (CRLF, "-dos");
1029       FROB (CR,   "-mac");
1030 #undef FROB
1031     }
1032   /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
1033      but it doesn't look intentional, so I'd rather return something
1034      meaningful or nothing at all. */
1035   return Qnil;
1036 }
1037
1038 static Lisp_Object
1039 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1040 {
1041   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1042   Lisp_Object new_coding_system;
1043
1044   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1045     return coding_system;
1046
1047   switch (type)
1048     {
1049     case EOL_AUTODETECT: return coding_system;
1050     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1051     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1052     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1053     default:       abort ();
1054     }
1055
1056   return NILP (new_coding_system) ? coding_system : new_coding_system;
1057 }
1058
1059 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1060 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1061 */
1062        (coding_system, eol_type))
1063 {
1064   coding_system = Fget_coding_system (coding_system);
1065
1066   return subsidiary_coding_system (coding_system,
1067                                    symbol_to_eol_type (eol_type));
1068 }
1069
1070 \f
1071 /************************************************************************/
1072 /*                         Coding system accessors                      */
1073 /************************************************************************/
1074
1075 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1076 Return the doc string for CODING-SYSTEM.
1077 */
1078        (coding_system))
1079 {
1080   coding_system = Fget_coding_system (coding_system);
1081   return XCODING_SYSTEM_DOC_STRING (coding_system);
1082 }
1083
1084 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1085 Return the type of CODING-SYSTEM.
1086 */
1087        (coding_system))
1088 {
1089   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1090     {
1091     default: abort ();
1092     case CODESYS_AUTODETECT:    return Qundecided;
1093 #ifdef MULE
1094     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1095     case CODESYS_ISO2022:       return Qiso2022;
1096     case CODESYS_BIG5:          return Qbig5;
1097     case CODESYS_UCS4:          return Qucs4;
1098     case CODESYS_UTF8:          return Qutf8;
1099     case CODESYS_CCL:           return Qccl;
1100 #endif
1101     case CODESYS_NO_CONVERSION: return Qno_conversion;
1102 #ifdef DEBUG_XEMACS
1103     case CODESYS_INTERNAL:      return Qinternal;
1104 #endif
1105     }
1106 }
1107
1108 #ifdef MULE
1109 static
1110 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1111 {
1112   Lisp_Object cs
1113     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1114
1115   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1116 }
1117
1118 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1119 Return initial charset of CODING-SYSTEM designated to GNUM.
1120 GNUM allows 0 .. 3.
1121 */
1122        (coding_system, gnum))
1123 {
1124   coding_system = Fget_coding_system (coding_system);
1125   CHECK_INT (gnum);
1126
1127   return coding_system_charset (coding_system, XINT (gnum));
1128 }
1129 #endif /* MULE */
1130
1131 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1132 Return the PROP property of CODING-SYSTEM.
1133 */
1134        (coding_system, prop))
1135 {
1136   int i, ok = 0;
1137   enum coding_system_type type;
1138
1139   coding_system = Fget_coding_system (coding_system);
1140   CHECK_SYMBOL (prop);
1141   type = XCODING_SYSTEM_TYPE (coding_system);
1142
1143   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1144     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1145       {
1146         ok = 1;
1147         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1148           {
1149           case CODESYS_PROP_ALL_OK:
1150             break;
1151 #ifdef MULE
1152           case CODESYS_PROP_ISO2022:
1153             if (type != CODESYS_ISO2022)
1154               signal_simple_error
1155                 ("Property only valid in ISO2022 coding systems",
1156                  prop);
1157             break;
1158
1159           case CODESYS_PROP_CCL:
1160             if (type != CODESYS_CCL)
1161               signal_simple_error
1162                 ("Property only valid in CCL coding systems",
1163                  prop);
1164             break;
1165 #endif /* MULE */
1166           default:
1167             abort ();
1168           }
1169       }
1170
1171   if (!ok)
1172     signal_simple_error ("Unrecognized property", prop);
1173
1174   if (EQ (prop, Qname))
1175     return XCODING_SYSTEM_NAME (coding_system);
1176   else if (EQ (prop, Qtype))
1177     return Fcoding_system_type (coding_system);
1178   else if (EQ (prop, Qdoc_string))
1179     return XCODING_SYSTEM_DOC_STRING (coding_system);
1180   else if (EQ (prop, Qmnemonic))
1181     return XCODING_SYSTEM_MNEMONIC (coding_system);
1182   else if (EQ (prop, Qeol_type))
1183     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1184   else if (EQ (prop, Qeol_lf))
1185     return XCODING_SYSTEM_EOL_LF (coding_system);
1186   else if (EQ (prop, Qeol_crlf))
1187     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1188   else if (EQ (prop, Qeol_cr))
1189     return XCODING_SYSTEM_EOL_CR (coding_system);
1190   else if (EQ (prop, Qpost_read_conversion))
1191     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1192   else if (EQ (prop, Qpre_write_conversion))
1193     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1194 #ifdef MULE
1195   else if (type == CODESYS_ISO2022)
1196     {
1197       if (EQ (prop, Qcharset_g0))
1198         return coding_system_charset (coding_system, 0);
1199       else if (EQ (prop, Qcharset_g1))
1200         return coding_system_charset (coding_system, 1);
1201       else if (EQ (prop, Qcharset_g2))
1202         return coding_system_charset (coding_system, 2);
1203       else if (EQ (prop, Qcharset_g3))
1204         return coding_system_charset (coding_system, 3);
1205
1206 #define FORCE_CHARSET(charset_num) \
1207   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1208    (coding_system, charset_num) ? Qt : Qnil)
1209
1210       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1211       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1212       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1213       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1214
1215 #define LISP_BOOLEAN(prop) \
1216   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1217
1218       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1219       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1220       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1221       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1222       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1223       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1224       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1225
1226       else if (EQ (prop, Qinput_charset_conversion))
1227         return
1228           unparse_charset_conversion_specs
1229             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1230       else if (EQ (prop, Qoutput_charset_conversion))
1231         return
1232           unparse_charset_conversion_specs
1233             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1234       else
1235         abort ();
1236     }
1237   else if (type == CODESYS_CCL)
1238     {
1239       if (EQ (prop, Qdecode))
1240         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1241       else if (EQ (prop, Qencode))
1242         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1243       else
1244         abort ();
1245     }
1246 #endif /* MULE */
1247   else
1248     abort ();
1249
1250   return Qnil; /* not reached */
1251 }
1252
1253 \f
1254 /************************************************************************/
1255 /*                       Coding category functions                      */
1256 /************************************************************************/
1257
1258 static int
1259 decode_coding_category (Lisp_Object symbol)
1260 {
1261   int i;
1262
1263   CHECK_SYMBOL (symbol);
1264   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1265     if (EQ (coding_category_symbol[i], symbol))
1266       return i;
1267
1268   signal_simple_error ("Unrecognized coding category", symbol);
1269   return 0; /* not reached */
1270 }
1271
1272 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1273 Return a list of all recognized coding categories.
1274 */
1275        ())
1276 {
1277   int i;
1278   Lisp_Object list = Qnil;
1279
1280   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1281     list = Fcons (coding_category_symbol[i], list);
1282   return list;
1283 }
1284
1285 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1286 Change the priority order of the coding categories.
1287 LIST should be list of coding categories, in descending order of
1288 priority.  Unspecified coding categories will be lower in priority
1289 than all specified ones, in the same relative order they were in
1290 previously.
1291 */
1292        (list))
1293 {
1294   int category_to_priority[CODING_CATEGORY_LAST + 1];
1295   int i, j;
1296   Lisp_Object rest;
1297
1298   /* First generate a list that maps coding categories to priorities. */
1299
1300   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1301     category_to_priority[i] = -1;
1302
1303   /* Highest priority comes from the specified list. */
1304   i = 0;
1305   EXTERNAL_LIST_LOOP (rest, list)
1306     {
1307       int cat = decode_coding_category (XCAR (rest));
1308
1309       if (category_to_priority[cat] >= 0)
1310         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1311       category_to_priority[cat] = i++;
1312     }
1313
1314   /* Now go through the existing categories by priority to retrieve
1315      the categories not yet specified and preserve their priority
1316      order. */
1317   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1318     {
1319       int cat = coding_category_by_priority[j];
1320       if (category_to_priority[cat] < 0)
1321         category_to_priority[cat] = i++;
1322     }
1323
1324   /* Now we need to construct the inverse of the mapping we just
1325      constructed. */
1326
1327   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1328     coding_category_by_priority[category_to_priority[i]] = i;
1329
1330   /* Phew!  That was confusing. */
1331   return Qnil;
1332 }
1333
1334 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1335 Return a list of coding categories in descending order of priority.
1336 */
1337        ())
1338 {
1339   int i;
1340   Lisp_Object list = Qnil;
1341
1342   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1343     list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1344                   list);
1345   return list;
1346 }
1347
1348 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1349 Change the coding system associated with a coding category.
1350 */
1351        (coding_category, coding_system))
1352 {
1353   int cat = decode_coding_category (coding_category);
1354
1355   coding_system = Fget_coding_system (coding_system);
1356   coding_category_system[cat] = coding_system;
1357   return Qnil;
1358 }
1359
1360 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1361 Return the coding system associated with a coding category.
1362 */
1363        (coding_category))
1364 {
1365   int cat = decode_coding_category (coding_category);
1366   Lisp_Object sys = coding_category_system[cat];
1367
1368   if (!NILP (sys))
1369     return XCODING_SYSTEM_NAME (sys);
1370   return Qnil;
1371 }
1372
1373 \f
1374 /************************************************************************/
1375 /*                     Detecting the encoding of data                   */
1376 /************************************************************************/
1377
1378 struct detection_state
1379 {
1380   enum eol_type eol_type;
1381   int seen_non_ascii;
1382   int mask;
1383 #ifdef MULE
1384   struct
1385     {
1386       int mask;
1387       int in_second_byte;
1388     }
1389   big5;
1390
1391   struct
1392     {
1393       int mask;
1394       int in_second_byte;
1395     }
1396   shift_jis;
1397
1398   struct
1399     {
1400       int mask;
1401       int in_byte;
1402   }
1403   ucs4;
1404
1405   struct
1406     {
1407       int mask;
1408       int in_byte;
1409     }
1410   utf8;
1411
1412   struct
1413     {
1414       int mask;
1415       int initted;
1416       struct iso2022_decoder iso;
1417       unsigned int flags;
1418       int high_byte_count;
1419       unsigned int saw_single_shift:1;
1420     }
1421   iso2022;
1422 #endif
1423   struct
1424     {
1425       int seen_anything;
1426       int just_saw_cr;
1427     }
1428   eol;
1429 };
1430
1431 static int
1432 acceptable_control_char_p (int c)
1433 {
1434   switch (c)
1435     {
1436       /* Allow and ignore control characters that you might
1437          reasonably see in a text file */
1438     case '\r':
1439     case '\n':
1440     case '\t':
1441     case  7: /* bell */
1442     case  8: /* backspace */
1443     case 11: /* vertical tab */
1444     case 12: /* form feed */
1445     case 26: /* MS-DOS C-z junk */
1446     case 31: /* '^_' -- for info */
1447       return 1;
1448     default:
1449       return 0;
1450     }
1451 }
1452
1453 static int
1454 mask_has_at_most_one_bit_p (int mask)
1455 {
1456   /* Perhaps the only thing useful you learn from intensive Microsoft
1457      technical interviews */
1458   return (mask & (mask - 1)) == 0;
1459 }
1460
1461 static enum eol_type
1462 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1463                  unsigned int n)
1464 {
1465   int c;
1466
1467   while (n--)
1468     {
1469       c = *src++;
1470       if (c == '\r')
1471         st->eol.just_saw_cr = 1;
1472       else
1473         {
1474           if (c == '\n')
1475             {
1476               if (st->eol.just_saw_cr)
1477                 return EOL_CRLF;
1478               else if (st->eol.seen_anything)
1479                 return EOL_LF;
1480             }
1481           else if (st->eol.just_saw_cr)
1482             return EOL_CR;
1483           st->eol.just_saw_cr = 0;
1484         }
1485       st->eol.seen_anything = 1;
1486     }
1487
1488   return EOL_AUTODETECT;
1489 }
1490
1491 /* Attempt to determine the encoding and EOL type of the given text.
1492    Before calling this function for the first type, you must initialize
1493    st->eol_type as appropriate and initialize st->mask to ~0.
1494
1495    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1496    not yet known.
1497
1498    st->mask holds the determined coding category mask, or ~0 if only
1499    ASCII has been seen so far.
1500
1501    Returns:
1502
1503    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1504         is present in st->mask
1505    1 == definitive answers are here for both st->eol_type and st->mask
1506 */
1507
1508 static int
1509 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1510                     unsigned int n, int just_do_eol)
1511 {
1512   int c;
1513
1514   if (st->eol_type == EOL_AUTODETECT)
1515     st->eol_type = detect_eol_type (st, src, n);
1516
1517   if (just_do_eol)
1518     return st->eol_type != EOL_AUTODETECT;
1519
1520   if (!st->seen_non_ascii)
1521     {
1522       for (; n; n--, src++)
1523         {
1524           c = *src;
1525           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1526             {
1527               st->seen_non_ascii = 1;
1528 #ifdef MULE
1529               st->shift_jis.mask = ~0;
1530               st->big5.mask = ~0;
1531               st->ucs4.mask = ~0;
1532               st->utf8.mask = ~0;
1533               st->iso2022.mask = ~0;
1534 #endif
1535               break;
1536             }
1537         }
1538     }
1539
1540   if (!n)
1541     return 0;
1542 #ifdef MULE
1543   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1544     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1545   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1546     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1547   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1548     st->big5.mask = detect_coding_big5 (st, src, n);
1549   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1550     st->utf8.mask = detect_coding_utf8 (st, src, n);
1551   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1552     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1553
1554   st->mask
1555     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1556     | st->utf8.mask | st->ucs4.mask;
1557 #endif
1558   {
1559     int retval = mask_has_at_most_one_bit_p (st->mask);
1560     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1561     return retval && st->eol_type != EOL_AUTODETECT;
1562   }
1563 }
1564
1565 static Lisp_Object
1566 coding_system_from_mask (int mask)
1567 {
1568   if (mask == ~0)
1569     {
1570       /* If the file was entirely or basically ASCII, use the
1571          default value of `buffer-file-coding-system'. */
1572       Lisp_Object retval =
1573         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1574       if (!NILP (retval))
1575         {
1576           retval = Ffind_coding_system (retval);
1577           if (NILP (retval))
1578             {
1579               warn_when_safe
1580                 (Qbad_variable, Qwarning,
1581                  "Invalid `default-buffer-file-coding-system', set to nil");
1582               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1583             }
1584         }
1585       if (NILP (retval))
1586         retval = Fget_coding_system (Qraw_text);
1587       return retval;
1588     }
1589   else
1590     {
1591       int i;
1592       int cat = -1;
1593 #ifdef MULE
1594       mask = postprocess_iso2022_mask (mask);
1595 #endif
1596       /* Look through the coding categories by priority and find
1597          the first one that is allowed. */
1598       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1599         {
1600           cat = coding_category_by_priority[i];
1601           if ((mask & (1 << cat)) &&
1602               !NILP (coding_category_system[cat]))
1603             break;
1604         }
1605       if (cat >= 0)
1606         return coding_category_system[cat];
1607       else
1608         return Fget_coding_system (Qraw_text);
1609     }
1610 }
1611
1612 /* Given a seekable read stream and potential coding system and EOL type
1613    as specified, do any autodetection that is called for.  If the
1614    coding system and/or EOL type are not autodetect, they will be left
1615    alone; but this function will never return an autodetect coding system
1616    or EOL type.
1617
1618    This function does not automatically fetch subsidiary coding systems;
1619    that should be unnecessary with the explicit eol-type argument. */
1620
1621 void
1622 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1623                               enum eol_type *eol_type_in_out)
1624 {
1625   struct detection_state decst;
1626
1627   if (*eol_type_in_out == EOL_AUTODETECT)
1628     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1629
1630   xzero (decst);
1631   decst.eol_type = *eol_type_in_out;
1632   decst.mask = ~0;
1633
1634   /* If autodetection is called for, do it now. */
1635   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1636       *eol_type_in_out == EOL_AUTODETECT)
1637     {
1638       unsigned char random_buffer[4096];
1639       int nread;
1640       Lisp_Object coding_system = Qnil;
1641
1642       nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1643       if (nread)
1644         {
1645           unsigned char *cp = random_buffer;
1646
1647           while (cp < random_buffer + nread)
1648             {
1649               if ((*cp++ == 'c') && (cp < random_buffer + nread) &&
1650                   (*cp++ == 'o') && (cp < random_buffer + nread) &&
1651                   (*cp++ == 'd') && (cp < random_buffer + nread) &&
1652                   (*cp++ == 'i') && (cp < random_buffer + nread) &&
1653                   (*cp++ == 'n') && (cp < random_buffer + nread) &&
1654                   (*cp++ == 'g') && (cp < random_buffer + nread) &&
1655                   (*cp++ == ':') && (cp < random_buffer + nread))
1656                 {
1657                   unsigned char coding_system_name[4096 - 6];
1658                   unsigned char *np = coding_system_name;
1659
1660                   while ( (cp < random_buffer + nread)
1661                           && ((*cp == ' ') || (*cp == '\t')) )
1662                     {
1663                       cp++;
1664                     }
1665                   while ( (cp < random_buffer + nread) &&
1666                           (*cp != ' ') && (*cp != '\t') && (*cp != ';') )
1667                     {
1668                       *np++ = *cp++;
1669                     }
1670                   *np = 0;
1671                   coding_system
1672                     = Ffind_coding_system (intern (coding_system_name));
1673                   break;
1674                 }
1675             }
1676           if (EQ(coding_system, Qnil))
1677             do{
1678               if (detect_coding_type (&decst, random_buffer, nread,
1679                                       XCODING_SYSTEM_TYPE (*codesys_in_out)
1680                                       != CODESYS_AUTODETECT))
1681                 break;
1682               nread = Lstream_read (stream,
1683                                     random_buffer, sizeof (random_buffer));
1684               if (!nread)
1685                 break;
1686             } while(1);
1687         }
1688       *eol_type_in_out = decst.eol_type;
1689       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1690         {
1691           if (EQ(coding_system, Qnil))
1692             *codesys_in_out = coding_system_from_mask (decst.mask);
1693           else
1694             *codesys_in_out = coding_system;
1695         }
1696     }
1697   /* If we absolutely can't determine the EOL type, just assume LF. */
1698   if (*eol_type_in_out == EOL_AUTODETECT)
1699     *eol_type_in_out = EOL_LF;
1700
1701   Lstream_rewind (stream);
1702 }
1703
1704 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1705 Detect coding system of the text in the region between START and END.
1706 Returned a list of possible coding systems ordered by priority.
1707 If only ASCII characters are found, it returns 'undecided or one of
1708 its subsidiary coding systems according to a detected end-of-line
1709 type.  Optional arg BUFFER defaults to the current buffer.
1710 */
1711        (start, end, buffer))
1712 {
1713   Lisp_Object val = Qnil;
1714   struct buffer *buf = decode_buffer (buffer, 0);
1715   Bufpos b, e;
1716   Lisp_Object instream, lb_instream;
1717   Lstream *istr, *lb_istr;
1718   struct detection_state decst;
1719   struct gcpro gcpro1, gcpro2;
1720
1721   get_buffer_range_char (buf, start, end, &b, &e, 0);
1722   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1723   lb_istr = XLSTREAM (lb_instream);
1724   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1725   istr = XLSTREAM (instream);
1726   GCPRO2 (instream, lb_instream);
1727   xzero (decst);
1728   decst.eol_type = EOL_AUTODETECT;
1729   decst.mask = ~0;
1730   while (1)
1731     {
1732       unsigned char random_buffer[4096];
1733       int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1734
1735       if (!nread)
1736         break;
1737       if (detect_coding_type (&decst, random_buffer, nread, 0))
1738         break;
1739     }
1740
1741   if (decst.mask == ~0)
1742     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1743                                     decst.eol_type);
1744   else
1745     {
1746       int i;
1747
1748       val = Qnil;
1749 #ifdef MULE
1750       decst.mask = postprocess_iso2022_mask (decst.mask);
1751 #endif
1752       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1753         {
1754           int sys = coding_category_by_priority[i];
1755           if (decst.mask & (1 << sys))
1756             {
1757               Lisp_Object codesys = coding_category_system[sys];
1758               if (!NILP (codesys))
1759                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1760               val = Fcons (codesys, val);
1761             }
1762         }
1763     }
1764   Lstream_close (istr);
1765   UNGCPRO;
1766   Lstream_delete (istr);
1767   Lstream_delete (lb_istr);
1768   return val;
1769 }
1770
1771 \f
1772 /************************************************************************/
1773 /*           Converting to internal Mule format ("decoding")            */
1774 /************************************************************************/
1775
1776 /* A decoding stream is a stream used for decoding text (i.e.
1777    converting from some external format to internal format).
1778    The decoding-stream object keeps track of the actual coding
1779    stream, the stream that is at the other end, and data that
1780    needs to be persistent across the lifetime of the stream. */
1781
1782 /* Handle the EOL stuff related to just-read-in character C.
1783    EOL_TYPE is the EOL type of the coding stream.
1784    FLAGS is the current value of FLAGS in the coding stream, and may
1785    be modified by this macro.  (The macro only looks at the
1786    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1787    bytes are to be written.  You need to also define a local goto
1788    label "label_continue_loop" that is at the end of the main
1789    character-reading loop.
1790
1791    If C is a CR character, then this macro handles it entirely and
1792    jumps to label_continue_loop.  Otherwise, this macro does not add
1793    anything to DST, and continues normally.  You should continue
1794    processing C normally after this macro. */
1795
1796 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
1797 do {                                                            \
1798   if (c == '\r')                                                \
1799     {                                                           \
1800       if (eol_type == EOL_CR)                                   \
1801         Dynarr_add (dst, '\n');                                 \
1802       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1803         Dynarr_add (dst, c);                                    \
1804       else                                                      \
1805         flags |= CODING_STATE_CR;                               \
1806       goto label_continue_loop;                                 \
1807     }                                                           \
1808   else if (flags & CODING_STATE_CR)                             \
1809     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
1810       if (c != '\n')                                            \
1811         Dynarr_add (dst, '\r');                                 \
1812       flags &= ~CODING_STATE_CR;                                \
1813     }                                                           \
1814 } while (0)
1815
1816 /* C should be a binary character in the range 0 - 255; convert
1817    to internal format and add to Dynarr DST. */
1818
1819 #ifdef UTF2000
1820 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1821 do {                                            \
1822   if (BYTE_ASCII_P (c))                         \
1823     Dynarr_add (dst, c);                        \
1824   else                                          \
1825     {                                           \
1826       Dynarr_add (dst, (c >> 6) | 0xc0);        \
1827       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
1828     }                                           \
1829 } while (0)
1830
1831 INLINE void
1832 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1833 {
1834   if ( c <= 0x7f )
1835     {
1836       Dynarr_add (dst, c);
1837     }
1838   else if ( c <= 0x7ff )
1839     {
1840       Dynarr_add (dst, (c >> 6) | 0xc0);
1841       Dynarr_add (dst, (c & 0x3f) | 0x80);
1842     }
1843   else if ( c <= 0xffff )
1844     {
1845       Dynarr_add (dst,  (c >> 12) | 0xe0);
1846       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1847       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1848     }
1849   else if ( c <= 0x1fffff )
1850     {
1851       Dynarr_add (dst,  (c >> 18) | 0xf0);
1852       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1853       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1854       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1855     }
1856   else if ( c <= 0x3ffffff )
1857     {
1858       Dynarr_add (dst,  (c >> 24) | 0xf8);
1859       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1860       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1861       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1862       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1863     }
1864   else
1865     {
1866       Dynarr_add (dst,  (c >> 30) | 0xfc);
1867       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1868       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1869       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1870       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1871       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1872     }
1873 }
1874 #else
1875 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
1876 do {                                            \
1877   if (BYTE_ASCII_P (c))                         \
1878     Dynarr_add (dst, c);                        \
1879   else if (BYTE_C1_P (c))                       \
1880     {                                           \
1881       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1882       Dynarr_add (dst, c + 0x20);               \
1883     }                                           \
1884   else                                          \
1885     {                                           \
1886       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1887       Dynarr_add (dst, c);                      \
1888     }                                           \
1889 } while (0)
1890 #endif
1891
1892 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
1893 do {                                    \
1894   if (ch)                               \
1895     {                                   \
1896       DECODE_ADD_BINARY_CHAR (ch, dst); \
1897       ch = 0;                           \
1898     }                                   \
1899 } while (0)
1900
1901 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1902 do {                                    \
1903   if (flags & CODING_STATE_END)         \
1904     {                                   \
1905       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
1906       if (flags & CODING_STATE_CR)      \
1907         Dynarr_add (dst, '\r');         \
1908     }                                   \
1909 } while (0)
1910
1911 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1912
1913 struct decoding_stream
1914 {
1915   /* Coding system that governs the conversion. */
1916   Lisp_Coding_System *codesys;
1917
1918   /* Stream that we read the encoded data from or
1919      write the decoded data to. */
1920   Lstream *other_end;
1921
1922   /* If we are reading, then we can return only a fixed amount of
1923      data, so if the conversion resulted in too much data, we store it
1924      here for retrieval the next time around. */
1925   unsigned_char_dynarr *runoff;
1926
1927   /* FLAGS holds flags indicating the current state of the decoding.
1928      Some of these flags are dependent on the coding system. */
1929   unsigned int flags;
1930
1931   /* CH holds a partially built-up character.  Since we only deal
1932      with one- and two-byte characters at the moment, we only use
1933      this to store the first byte of a two-byte character. */
1934   unsigned int ch;
1935
1936   /* EOL_TYPE specifies the type of end-of-line conversion that
1937      currently applies.  We need to keep this separate from the
1938      EOL type stored in CODESYS because the latter might indicate
1939      automatic EOL-type detection while the former will always
1940      indicate a particular EOL type. */
1941   enum eol_type eol_type;
1942 #ifdef MULE
1943   /* Additional ISO2022 information.  We define the structure above
1944      because it's also needed by the detection routines. */
1945   struct iso2022_decoder iso2022;
1946
1947   /* Additional information (the state of the running CCL program)
1948      used by the CCL decoder. */
1949   struct ccl_program ccl;
1950
1951   /* counter for UTF-8 or UCS-4 */
1952   unsigned char counter;
1953 #endif
1954   struct detection_state decst;
1955 };
1956
1957 static int decoding_reader     (Lstream *stream,       unsigned char *data, size_t size);
1958 static int decoding_writer     (Lstream *stream, CONST unsigned char *data, size_t size);
1959 static int decoding_rewinder   (Lstream *stream);
1960 static int decoding_seekable_p (Lstream *stream);
1961 static int decoding_flusher    (Lstream *stream);
1962 static int decoding_closer     (Lstream *stream);
1963
1964 static Lisp_Object decoding_marker (Lisp_Object stream,
1965                                     void (*markobj) (Lisp_Object));
1966
1967 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1968                                sizeof (struct decoding_stream));
1969
1970 static Lisp_Object
1971 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1972 {
1973   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1974   Lisp_Object str_obj;
1975
1976   /* We do not need to mark the coding systems or charsets stored
1977      within the stream because they are stored in a global list
1978      and automatically marked. */
1979
1980   XSETLSTREAM (str_obj, str);
1981   markobj (str_obj);
1982   if (str->imp->marker)
1983     return (str->imp->marker) (str_obj, markobj);
1984   else
1985     return Qnil;
1986 }
1987
1988 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
1989    so we read data from the other end, decode it, and store it into DATA. */
1990
1991 static int
1992 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1993 {
1994   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1995   unsigned char *orig_data = data;
1996   int read_size;
1997   int error_occurred = 0;
1998
1999   /* We need to interface to mule_decode(), which expects to take some
2000      amount of data and store the result into a Dynarr.  We have
2001      mule_decode() store into str->runoff, and take data from there
2002      as necessary. */
2003
2004   /* We loop until we have enough data, reading chunks from the other
2005      end and decoding it. */
2006   while (1)
2007     {
2008       /* Take data from the runoff if we can.  Make sure to take at
2009          most SIZE bytes, and delete the data from the runoff. */
2010       if (Dynarr_length (str->runoff) > 0)
2011         {
2012           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2013           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2014           Dynarr_delete_many (str->runoff, 0, chunk);
2015           data += chunk;
2016           size -= chunk;
2017         }
2018
2019       if (size == 0)
2020         break; /* No more room for data */
2021
2022       if (str->flags & CODING_STATE_END)
2023         /* This means that on the previous iteration, we hit the EOF on
2024            the other end.  We loop once more so that mule_decode() can
2025            output any final stuff it may be holding, or any "go back
2026            to a sane state" escape sequences. (This latter makes sense
2027            during encoding.) */
2028         break;
2029
2030       /* Exhausted the runoff, so get some more.  DATA has at least
2031          SIZE bytes left of storage in it, so it's OK to read directly
2032          into it.  (We'll be overwriting above, after we've decoded it
2033          into the runoff.) */
2034       read_size = Lstream_read (str->other_end, data, size);
2035       if (read_size < 0)
2036         {
2037           error_occurred = 1;
2038           break;
2039         }
2040       if (read_size == 0)
2041         /* There might be some more end data produced in the translation.
2042            See the comment above. */
2043         str->flags |= CODING_STATE_END;
2044       mule_decode (stream, data, str->runoff, read_size);
2045     }
2046
2047   if (data - orig_data == 0)
2048     return error_occurred ? -1 : 0;
2049   else
2050     return data - orig_data;
2051 }
2052
2053 static int
2054 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2055 {
2056   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2057   int retval;
2058
2059   /* Decode all our data into the runoff, and then attempt to write
2060      it all out to the other end.  Remove whatever chunk we succeeded
2061      in writing. */
2062   mule_decode (stream, data, str->runoff, size);
2063   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2064                           Dynarr_length (str->runoff));
2065   if (retval > 0)
2066     Dynarr_delete_many (str->runoff, 0, retval);
2067   /* Do NOT return retval.  The return value indicates how much
2068      of the incoming data was written, not how many bytes were
2069      written. */
2070   return size;
2071 }
2072
2073 static void
2074 reset_decoding_stream (struct decoding_stream *str)
2075 {
2076 #ifdef MULE
2077   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2078     {
2079       Lisp_Object coding_system;
2080       XSETCODING_SYSTEM (coding_system, str->codesys);
2081       reset_iso2022 (coding_system, &str->iso2022);
2082     }
2083   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2084     {
2085       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2086     }
2087   str->counter = 0;
2088 #endif /* MULE */
2089   str->flags = str->ch = 0;
2090 }
2091
2092 static int
2093 decoding_rewinder (Lstream *stream)
2094 {
2095   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2096   reset_decoding_stream (str);
2097   Dynarr_reset (str->runoff);
2098   return Lstream_rewind (str->other_end);
2099 }
2100
2101 static int
2102 decoding_seekable_p (Lstream *stream)
2103 {
2104   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2105   return Lstream_seekable_p (str->other_end);
2106 }
2107
2108 static int
2109 decoding_flusher (Lstream *stream)
2110 {
2111   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2112   return Lstream_flush (str->other_end);
2113 }
2114
2115 static int
2116 decoding_closer (Lstream *stream)
2117 {
2118   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2119   if (stream->flags & LSTREAM_FL_WRITE)
2120     {
2121       str->flags |= CODING_STATE_END;
2122       decoding_writer (stream, 0, 0);
2123     }
2124   Dynarr_free (str->runoff);
2125 #ifdef MULE
2126 #ifdef ENABLE_COMPOSITE_CHARS
2127   if (str->iso2022.composite_chars)
2128     Dynarr_free (str->iso2022.composite_chars);
2129 #endif
2130 #endif
2131   return Lstream_close (str->other_end);
2132 }
2133
2134 Lisp_Object
2135 decoding_stream_coding_system (Lstream *stream)
2136 {
2137   Lisp_Object coding_system;
2138   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2139
2140   XSETCODING_SYSTEM (coding_system, str->codesys);
2141   return subsidiary_coding_system (coding_system, str->eol_type);
2142 }
2143
2144 void
2145 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2146 {
2147   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2148   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2149   str->codesys = cs;
2150   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2151     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2152   reset_decoding_stream (str);
2153 }
2154
2155 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2156    stream for writing, no automatic code detection will be performed.
2157    The reason for this is that automatic code detection requires a
2158    seekable input.  Things will also fail if you open a decoding
2159    stream for reading using a non-fully-specified coding system and
2160    a non-seekable input stream. */
2161
2162 static Lisp_Object
2163 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2164                         CONST char *mode)
2165 {
2166   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2167   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2168   Lisp_Object obj;
2169
2170   xzero (*str);
2171   str->other_end = stream;
2172   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2173   str->eol_type = EOL_AUTODETECT;
2174   if (!strcmp (mode, "r")
2175       && Lstream_seekable_p (stream))
2176     /* We can determine the coding system now. */
2177     determine_real_coding_system (stream, &codesys, &str->eol_type);
2178   set_decoding_stream_coding_system (lstr, codesys);
2179   str->decst.eol_type = str->eol_type;
2180   str->decst.mask = ~0;
2181   XSETLSTREAM (obj, lstr);
2182   return obj;
2183 }
2184
2185 Lisp_Object
2186 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2187 {
2188   return make_decoding_stream_1 (stream, codesys, "r");
2189 }
2190
2191 Lisp_Object
2192 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2193 {
2194   return make_decoding_stream_1 (stream, codesys, "w");
2195 }
2196
2197 /* Note: the decode_coding_* functions all take the same
2198    arguments as mule_decode(), which is to say some SRC data of
2199    size N, which is to be stored into dynamic array DST.
2200    DECODING is the stream within which the decoding is
2201    taking place, but no data is actually read from or
2202    written to that stream; that is handled in decoding_reader()
2203    or decoding_writer().  This allows the same functions to
2204    be used for both reading and writing. */
2205
2206 static void
2207 mule_decode (Lstream *decoding, CONST unsigned char *src,
2208              unsigned_char_dynarr *dst, unsigned int n)
2209 {
2210   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2211
2212   /* If necessary, do encoding-detection now.  We do this when
2213      we're a writing stream or a non-seekable reading stream,
2214      meaning that we can't just process the whole input,
2215      rewind, and start over. */
2216
2217   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2218       str->eol_type == EOL_AUTODETECT)
2219     {
2220       Lisp_Object codesys;
2221
2222       XSETCODING_SYSTEM (codesys, str->codesys);
2223       detect_coding_type (&str->decst, src, n,
2224                           CODING_SYSTEM_TYPE (str->codesys) !=
2225                           CODESYS_AUTODETECT);
2226       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2227           str->decst.mask != ~0)
2228         /* #### This is cheesy.  What we really ought to do is
2229            buffer up a certain amount of data so as to get a
2230            less random result. */
2231         codesys = coding_system_from_mask (str->decst.mask);
2232       str->eol_type = str->decst.eol_type;
2233       if (XCODING_SYSTEM (codesys) != str->codesys)
2234         {
2235           /* Preserve the CODING_STATE_END flag in case it was set.
2236              If we erase it, bad things might happen. */
2237           int was_end = str->flags & CODING_STATE_END;
2238           set_decoding_stream_coding_system (decoding, codesys);
2239           if (was_end)
2240             str->flags |= CODING_STATE_END;
2241         }
2242     }
2243
2244   switch (CODING_SYSTEM_TYPE (str->codesys))
2245     {
2246 #ifdef DEBUG_XEMACS
2247     case CODESYS_INTERNAL:
2248       Dynarr_add_many (dst, src, n);
2249       break;
2250 #endif
2251     case CODESYS_AUTODETECT:
2252       /* If we got this far and still haven't decided on the coding
2253          system, then do no conversion. */
2254     case CODESYS_NO_CONVERSION:
2255       decode_coding_no_conversion (decoding, src, dst, n);
2256       break;
2257 #ifdef MULE
2258     case CODESYS_SHIFT_JIS:
2259       decode_coding_sjis (decoding, src, dst, n);
2260       break;
2261     case CODESYS_BIG5:
2262       decode_coding_big5 (decoding, src, dst, n);
2263       break;
2264     case CODESYS_UCS4:
2265       decode_coding_ucs4 (decoding, src, dst, n);
2266       break;
2267     case CODESYS_UTF8:
2268       decode_coding_utf8 (decoding, src, dst, n);
2269       break;
2270     case CODESYS_CCL:
2271       str->ccl.last_block = str->flags & CODING_STATE_END;
2272       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2273       break;
2274     case CODESYS_ISO2022:
2275       decode_coding_iso2022 (decoding, src, dst, n);
2276       break;
2277 #endif /* MULE */
2278     default:
2279       abort ();
2280     }
2281 }
2282
2283 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2284 Decode the text between START and END which is encoded in CODING-SYSTEM.
2285 This is useful if you've read in encoded text from a file without decoding
2286 it (e.g. you read in a JIS-formatted file but used the `binary' or
2287 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2288 Return length of decoded text.
2289 BUFFER defaults to the current buffer if unspecified.
2290 */
2291        (start, end, coding_system, buffer))
2292 {
2293   Bufpos b, e;
2294   struct buffer *buf = decode_buffer (buffer, 0);
2295   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2296   Lstream *istr, *ostr;
2297   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2298
2299   get_buffer_range_char (buf, start, end, &b, &e, 0);
2300
2301   barf_if_buffer_read_only (buf, b, e);
2302
2303   coding_system = Fget_coding_system (coding_system);
2304   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2305   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2306   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2307                                               coding_system);
2308   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2309                                            Fget_coding_system (Qbinary));
2310   istr = XLSTREAM (instream);
2311   ostr = XLSTREAM (outstream);
2312   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2313
2314   /* The chain of streams looks like this:
2315
2316      [BUFFER] <----- send through
2317                      ------> [ENCODE AS BINARY]
2318                              ------> [DECODE AS SPECIFIED]
2319                                      ------> [BUFFER]
2320    */
2321
2322   while (1)
2323     {
2324       char tempbuf[1024]; /* some random amount */
2325       Bufpos newpos, even_newer_pos;
2326       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2327       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2328
2329       if (!size_in_bytes)
2330         break;
2331       newpos = lisp_buffer_stream_startpos (istr);
2332       Lstream_write (ostr, tempbuf, size_in_bytes);
2333       even_newer_pos = lisp_buffer_stream_startpos (istr);
2334       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2335                            even_newer_pos, 0);
2336     }
2337   Lstream_close (istr);
2338   Lstream_close (ostr);
2339   UNGCPRO;
2340   Lstream_delete (istr);
2341   Lstream_delete (ostr);
2342   Lstream_delete (XLSTREAM (de_outstream));
2343   Lstream_delete (XLSTREAM (lb_outstream));
2344   return Qnil;
2345 }
2346
2347 \f
2348 /************************************************************************/
2349 /*           Converting to an external encoding ("encoding")            */
2350 /************************************************************************/
2351
2352 /* An encoding stream is an output stream.  When you create the
2353    stream, you specify the coding system that governs the encoding
2354    and another stream that the resulting encoded data is to be
2355    sent to, and then start sending data to it. */
2356
2357 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2358
2359 struct encoding_stream
2360 {
2361   /* Coding system that governs the conversion. */
2362   Lisp_Coding_System *codesys;
2363
2364   /* Stream that we read the encoded data from or
2365      write the decoded data to. */
2366   Lstream *other_end;
2367
2368   /* If we are reading, then we can return only a fixed amount of
2369      data, so if the conversion resulted in too much data, we store it
2370      here for retrieval the next time around. */
2371   unsigned_char_dynarr *runoff;
2372
2373   /* FLAGS holds flags indicating the current state of the encoding.
2374      Some of these flags are dependent on the coding system. */
2375   unsigned int flags;
2376
2377   /* CH holds a partially built-up character.  Since we only deal
2378      with one- and two-byte characters at the moment, we only use
2379      this to store the first byte of a two-byte character. */
2380   unsigned int ch;
2381 #ifdef MULE
2382   /* Additional information used by the ISO2022 encoder. */
2383   struct
2384     {
2385       /* CHARSET holds the character sets currently assigned to the G0
2386          through G3 registers.  It is initialized from the array
2387          INITIAL_CHARSET in CODESYS. */
2388       Lisp_Object charset[4];
2389
2390       /* Which registers are currently invoked into the left (GL) and
2391          right (GR) halves of the 8-bit encoding space? */
2392       int register_left, register_right;
2393
2394       /* Whether we need to explicitly designate the charset in the
2395          G? register before using it.  It is initialized from the
2396          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2397       unsigned char force_charset_on_output[4];
2398
2399       /* Other state variables that need to be preserved across
2400          invocations. */
2401       Lisp_Object current_charset;
2402       int current_half;
2403       int current_char_boundary;
2404     } iso2022;
2405
2406   void (*encode_char) (struct encoding_stream *str, Emchar c,
2407                        unsigned_char_dynarr *dst, unsigned int *flags);
2408   void (*finish) (struct encoding_stream *str,
2409                   unsigned_char_dynarr *dst, unsigned int *flags);
2410
2411   /* Additional information (the state of the running CCL program)
2412      used by the CCL encoder. */
2413   struct ccl_program ccl;
2414 #endif /* MULE */
2415 };
2416
2417 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2418 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2419                             size_t size);
2420 static int encoding_rewinder   (Lstream *stream);
2421 static int encoding_seekable_p (Lstream *stream);
2422 static int encoding_flusher    (Lstream *stream);
2423 static int encoding_closer     (Lstream *stream);
2424
2425 static Lisp_Object encoding_marker (Lisp_Object stream,
2426                                     void (*markobj) (Lisp_Object));
2427
2428 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2429                                sizeof (struct encoding_stream));
2430
2431 static Lisp_Object
2432 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2433 {
2434   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2435   Lisp_Object str_obj;
2436
2437   /* We do not need to mark the coding systems or charsets stored
2438      within the stream because they are stored in a global list
2439      and automatically marked. */
2440
2441   XSETLSTREAM (str_obj, str);
2442   markobj (str_obj);
2443   if (str->imp->marker)
2444     return (str->imp->marker) (str_obj, markobj);
2445   else
2446     return Qnil;
2447 }
2448
2449 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2450    so we read data from the other end, encode it, and store it into DATA. */
2451
2452 static int
2453 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2454 {
2455   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2456   unsigned char *orig_data = data;
2457   int read_size;
2458   int error_occurred = 0;
2459
2460   /* We need to interface to mule_encode(), which expects to take some
2461      amount of data and store the result into a Dynarr.  We have
2462      mule_encode() store into str->runoff, and take data from there
2463      as necessary. */
2464
2465   /* We loop until we have enough data, reading chunks from the other
2466      end and encoding it. */
2467   while (1)
2468     {
2469       /* Take data from the runoff if we can.  Make sure to take at
2470          most SIZE bytes, and delete the data from the runoff. */
2471       if (Dynarr_length (str->runoff) > 0)
2472         {
2473           int chunk = min ((int) size, Dynarr_length (str->runoff));
2474           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2475           Dynarr_delete_many (str->runoff, 0, chunk);
2476           data += chunk;
2477           size -= chunk;
2478         }
2479
2480       if (size == 0)
2481         break; /* No more room for data */
2482
2483       if (str->flags & CODING_STATE_END)
2484         /* This means that on the previous iteration, we hit the EOF on
2485            the other end.  We loop once more so that mule_encode() can
2486            output any final stuff it may be holding, or any "go back
2487            to a sane state" escape sequences. (This latter makes sense
2488            during encoding.) */
2489         break;
2490
2491       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2492          left of storage in it, so it's OK to read directly into it.
2493          (We'll be overwriting above, after we've encoded it into the
2494          runoff.) */
2495       read_size = Lstream_read (str->other_end, data, size);
2496       if (read_size < 0)
2497         {
2498           error_occurred = 1;
2499           break;
2500         }
2501       if (read_size == 0)
2502         /* There might be some more end data produced in the translation.
2503            See the comment above. */
2504         str->flags |= CODING_STATE_END;
2505       mule_encode (stream, data, str->runoff, read_size);
2506     }
2507
2508   if (data == orig_data)
2509     return error_occurred ? -1 : 0;
2510   else
2511     return data - orig_data;
2512 }
2513
2514 static int
2515 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2516 {
2517   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2518   int retval;
2519
2520   /* Encode all our data into the runoff, and then attempt to write
2521      it all out to the other end.  Remove whatever chunk we succeeded
2522      in writing. */
2523   mule_encode (stream, data, str->runoff, size);
2524   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2525                           Dynarr_length (str->runoff));
2526   if (retval > 0)
2527     Dynarr_delete_many (str->runoff, 0, retval);
2528   /* Do NOT return retval.  The return value indicates how much
2529      of the incoming data was written, not how many bytes were
2530      written. */
2531   return size;
2532 }
2533
2534 static void
2535 reset_encoding_stream (struct encoding_stream *str)
2536 {
2537 #ifdef MULE
2538   switch (CODING_SYSTEM_TYPE (str->codesys))
2539     {
2540     case CODESYS_ISO2022:
2541       {
2542         int i;
2543
2544         str->encode_char = &char_encode_iso2022;
2545         str->finish = &char_finish_iso2022;
2546         for (i = 0; i < 4; i++)
2547           {
2548             str->iso2022.charset[i] =
2549               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2550             str->iso2022.force_charset_on_output[i] =
2551               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2552           }
2553         str->iso2022.register_left = 0;
2554         str->iso2022.register_right = 1;
2555         str->iso2022.current_charset = Qnil;
2556         str->iso2022.current_half = 0;
2557         break;
2558       }
2559     case CODESYS_CCL:
2560       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2561       break;
2562     case CODESYS_UTF8:
2563       str->encode_char = &char_encode_utf8;
2564       str->finish = &char_finish_utf8;
2565     case CODESYS_UCS4:
2566       str->encode_char = &char_encode_ucs4;
2567       str->finish = &char_finish_ucs4;
2568     case CODESYS_SHIFT_JIS:
2569       str->encode_char = &char_encode_shift_jis;
2570       str->finish = &char_finish_shift_jis;
2571     default:
2572       break;
2573     }
2574 #endif /* MULE */
2575   str->iso2022.current_char_boundary = 0;
2576   str->flags = str->ch = 0;
2577 }
2578
2579 static int
2580 encoding_rewinder (Lstream *stream)
2581 {
2582   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2583   reset_encoding_stream (str);
2584   Dynarr_reset (str->runoff);
2585   return Lstream_rewind (str->other_end);
2586 }
2587
2588 static int
2589 encoding_seekable_p (Lstream *stream)
2590 {
2591   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2592   return Lstream_seekable_p (str->other_end);
2593 }
2594
2595 static int
2596 encoding_flusher (Lstream *stream)
2597 {
2598   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2599   return Lstream_flush (str->other_end);
2600 }
2601
2602 static int
2603 encoding_closer (Lstream *stream)
2604 {
2605   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2606   if (stream->flags & LSTREAM_FL_WRITE)
2607     {
2608       str->flags |= CODING_STATE_END;
2609       encoding_writer (stream, 0, 0);
2610     }
2611   Dynarr_free (str->runoff);
2612   return Lstream_close (str->other_end);
2613 }
2614
2615 Lisp_Object
2616 encoding_stream_coding_system (Lstream *stream)
2617 {
2618   Lisp_Object coding_system;
2619   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2620
2621   XSETCODING_SYSTEM (coding_system, str->codesys);
2622   return coding_system;
2623 }
2624
2625 void
2626 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2627 {
2628   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2629   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2630   str->codesys = cs;
2631   reset_encoding_stream (str);
2632 }
2633
2634 static Lisp_Object
2635 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2636                         CONST char *mode)
2637 {
2638   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2639   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2640   Lisp_Object obj;
2641
2642   xzero (*str);
2643   str->runoff = Dynarr_new (unsigned_char);
2644   str->other_end = stream;
2645   set_encoding_stream_coding_system (lstr, codesys);
2646   XSETLSTREAM (obj, lstr);
2647   return obj;
2648 }
2649
2650 Lisp_Object
2651 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2652 {
2653   return make_encoding_stream_1 (stream, codesys, "r");
2654 }
2655
2656 Lisp_Object
2657 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2658 {
2659   return make_encoding_stream_1 (stream, codesys, "w");
2660 }
2661
2662 /* Convert N bytes of internally-formatted data stored in SRC to an
2663    external format, according to the encoding stream ENCODING.
2664    Store the encoded data into DST. */
2665
2666 static void
2667 mule_encode (Lstream *encoding, CONST unsigned char *src,
2668              unsigned_char_dynarr *dst, unsigned int n)
2669 {
2670   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2671
2672   switch (CODING_SYSTEM_TYPE (str->codesys))
2673     {
2674 #ifdef DEBUG_XEMACS
2675     case CODESYS_INTERNAL:
2676       Dynarr_add_many (dst, src, n);
2677       break;
2678 #endif
2679     case CODESYS_AUTODETECT:
2680       /* If we got this far and still haven't decided on the coding
2681          system, then do no conversion. */
2682     case CODESYS_NO_CONVERSION:
2683       encode_coding_no_conversion (encoding, src, dst, n);
2684       break;
2685 #ifdef MULE
2686     case CODESYS_BIG5:
2687       encode_coding_big5 (encoding, src, dst, n);
2688       break;
2689     case CODESYS_CCL:
2690       str->ccl.last_block = str->flags & CODING_STATE_END;
2691       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2692       break;
2693 #endif /* MULE */
2694     default:
2695       text_encode_generic (encoding, src, dst, n);
2696     }
2697 }
2698
2699 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2700 Encode the text between START and END using CODING-SYSTEM.
2701 This will, for example, convert Japanese characters into stuff such as
2702 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2703 text.  BUFFER defaults to the current buffer if unspecified.
2704 */
2705        (start, end, coding_system, buffer))
2706 {
2707   Bufpos b, e;
2708   struct buffer *buf = decode_buffer (buffer, 0);
2709   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2710   Lstream *istr, *ostr;
2711   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2712
2713   get_buffer_range_char (buf, start, end, &b, &e, 0);
2714
2715   barf_if_buffer_read_only (buf, b, e);
2716
2717   coding_system = Fget_coding_system (coding_system);
2718   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2719   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2720   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2721                                               Fget_coding_system (Qbinary));
2722   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2723                                            coding_system);
2724   istr = XLSTREAM (instream);
2725   ostr = XLSTREAM (outstream);
2726   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2727   /* The chain of streams looks like this:
2728
2729      [BUFFER] <----- send through
2730                      ------> [ENCODE AS SPECIFIED]
2731                              ------> [DECODE AS BINARY]
2732                                      ------> [BUFFER]
2733    */
2734   while (1)
2735     {
2736       char tempbuf[1024]; /* some random amount */
2737       Bufpos newpos, even_newer_pos;
2738       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2739       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2740
2741       if (!size_in_bytes)
2742         break;
2743       newpos = lisp_buffer_stream_startpos (istr);
2744       Lstream_write (ostr, tempbuf, size_in_bytes);
2745       even_newer_pos = lisp_buffer_stream_startpos (istr);
2746       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2747                            even_newer_pos, 0);
2748     }
2749
2750   {
2751     Charcount retlen =
2752       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2753     Lstream_close (istr);
2754     Lstream_close (ostr);
2755     UNGCPRO;
2756     Lstream_delete (istr);
2757     Lstream_delete (ostr);
2758     Lstream_delete (XLSTREAM (de_outstream));
2759     Lstream_delete (XLSTREAM (lb_outstream));
2760     return make_int (retlen);
2761   }
2762 }
2763
2764 #ifdef MULE
2765 \f
2766 static void
2767 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
2768                      unsigned_char_dynarr *dst, unsigned int n)
2769 {
2770   unsigned char c;
2771   unsigned char char_boundary;
2772   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2773   unsigned int flags          = str->flags;
2774   Emchar ch                   = str->ch;
2775
2776   char_boundary = str->iso2022.current_char_boundary;
2777
2778   while (n--)
2779     {
2780       c = *src++;
2781
2782       if (char_boundary == 0)
2783         {
2784           if (c >= 0xfc)
2785             {
2786               ch = c & 0x01;
2787               char_boundary = 5;
2788             }
2789           else if (c >= 0xf8)
2790             {
2791               ch = c & 0x03;
2792               char_boundary = 4;
2793             }
2794           else if (c >= 0xf0)
2795             {
2796               ch = c & 0x07;
2797               char_boundary = 3;
2798             }
2799           else if (c >= 0xe0)
2800             {
2801               ch = c & 0x0f;
2802               char_boundary = 2;
2803             }
2804           else if (c >= 0xc0)
2805             {
2806               ch = c & 0x1f;
2807               char_boundary = 1;
2808             }
2809           else
2810             (*str->encode_char) (str, c, dst, &flags);
2811         }
2812       else if (char_boundary == 1)
2813         {
2814           (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
2815           ch =0;
2816           char_boundary = 0;
2817         }
2818       else
2819         {
2820           ch = (ch << 6) | (c & 0x3f);
2821           char_boundary--;
2822         }
2823     }
2824
2825   if ((char_boundary == 0) && (flags & CODING_STATE_END))
2826     {
2827       (*str->finish) (str, dst, &flags);
2828     }
2829
2830   str->flags = flags;
2831   str->ch    = ch;
2832   str->iso2022.current_char_boundary = char_boundary;
2833 }
2834
2835 \f
2836 /************************************************************************/
2837 /*                          Shift-JIS methods                           */
2838 /************************************************************************/
2839
2840 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2841    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2842    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2843    encoded by "position-code + 0x80".  A character of JISX0208
2844    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2845    position-codes are divided and shifted so that it fit in the range
2846    below.
2847
2848    --- CODE RANGE of Shift-JIS ---
2849    (character set)      (range)
2850    ASCII                0x00 .. 0x7F
2851    JISX0201-Kana        0xA0 .. 0xDF
2852    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2853             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2854    -------------------------------
2855
2856 */
2857
2858 /* Is this the first byte of a Shift-JIS two-byte char? */
2859
2860 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2861   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2862
2863 /* Is this the second byte of a Shift-JIS two-byte char? */
2864
2865 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2866   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2867
2868 #define BYTE_SJIS_KATAKANA_P(c) \
2869   ((c) >= 0xA1 && (c) <= 0xDF)
2870
2871 static int
2872 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2873                     unsigned int n)
2874 {
2875   int c;
2876
2877   while (n--)
2878     {
2879       c = *src++;
2880       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2881         return 0;
2882       if (st->shift_jis.in_second_byte)
2883         {
2884           st->shift_jis.in_second_byte = 0;
2885           if (c < 0x40)
2886             return 0;
2887         }
2888       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2889         st->shift_jis.in_second_byte = 1;
2890     }
2891   return CODING_CATEGORY_SHIFT_JIS_MASK;
2892 }
2893
2894 /* Convert Shift-JIS data to internal format. */
2895
2896 static void
2897 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2898                     unsigned_char_dynarr *dst, unsigned int n)
2899 {
2900   unsigned char c;
2901   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2902   unsigned int flags  = str->flags;
2903   unsigned int ch     = str->ch;
2904   eol_type_t eol_type = str->eol_type;
2905
2906   while (n--)
2907     {
2908       c = *src++;
2909
2910       if (ch)
2911         {
2912           /* Previous character was first byte of Shift-JIS Kanji char. */
2913           if (BYTE_SJIS_TWO_BYTE_2_P (c))
2914             {
2915               unsigned char e1, e2;
2916
2917               DECODE_SJIS (ch, c, e1, e2);
2918 #ifdef UTF2000
2919               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
2920                                             e1 & 0x7F,
2921                                             e2 & 0x7F), dst);
2922 #else
2923               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2924               Dynarr_add (dst, e1);
2925               Dynarr_add (dst, e2);
2926 #endif
2927             }
2928           else
2929             {
2930               DECODE_ADD_BINARY_CHAR (ch, dst);
2931               DECODE_ADD_BINARY_CHAR (c, dst);
2932             }
2933           ch = 0;
2934         }
2935       else
2936         {
2937           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2938           if (BYTE_SJIS_TWO_BYTE_1_P (c))
2939             ch = c;
2940           else if (BYTE_SJIS_KATAKANA_P (c))
2941             {
2942 #ifdef UTF2000
2943               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
2944                                             c & 0x7F, 0), dst);
2945 #else
2946               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2947               Dynarr_add (dst, c);
2948 #endif
2949             }
2950 #ifdef UTF2000
2951           else if (c > 32)
2952             DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
2953                                           c, 0), dst);
2954 #endif
2955           else
2956             DECODE_ADD_BINARY_CHAR (c, dst);
2957         }
2958     label_continue_loop:;
2959     }
2960
2961   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2962
2963   str->flags = flags;
2964   str->ch    = ch;
2965 }
2966
2967 /* Convert internal character representation to Shift_JIS. */
2968
2969 void
2970 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
2971                        unsigned_char_dynarr *dst, unsigned int *flags)
2972 {
2973   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2974
2975   if (ch == '\n')
2976     {
2977       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2978         Dynarr_add (dst, '\r');
2979       if (eol_type != EOL_CR)
2980         Dynarr_add (dst, ch);
2981     }
2982   else
2983     {
2984       Lisp_Object charset;
2985       unsigned int c1, c2, s1, s2;
2986
2987 #ifdef UTF2000
2988       if ( (c1 = get_byte_from_character_table (ch, ucs_to_latin_jisx0201)) )
2989         {
2990           charset = Vcharset_latin_jisx0201;
2991           c2 = 0;
2992         }
2993       else
2994 #endif
2995         BREAKUP_CHAR (ch, charset, c1, c2);
2996
2997       if (EQ(charset, Vcharset_katakana_jisx0201))
2998         {
2999           Dynarr_add (dst, c1 | 0x80);
3000         }
3001       else if (c2 == 0)
3002         {
3003           Dynarr_add (dst, c1);
3004         }
3005       else if (EQ(charset, Vcharset_japanese_jisx0208))
3006         {
3007           ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3008           Dynarr_add (dst, s1);
3009           Dynarr_add (dst, s2);
3010         }
3011       else
3012         Dynarr_add (dst, '?');
3013     }
3014 }
3015
3016 void
3017 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3018                        unsigned int *flags)
3019 {
3020 }
3021
3022 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3023 Decode a JISX0208 character of Shift-JIS coding-system.
3024 CODE is the character code in Shift-JIS as a cons of type bytes.
3025 Return the corresponding character.
3026 */
3027        (code))
3028 {
3029   unsigned char c1, c2, s1, s2;
3030
3031   CHECK_CONS (code);
3032   CHECK_INT (XCAR (code));
3033   CHECK_INT (XCDR (code));
3034   s1 = XINT (XCAR (code));
3035   s2 = XINT (XCDR (code));
3036   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3037       BYTE_SJIS_TWO_BYTE_2_P (s2))
3038     {
3039       DECODE_SJIS (s1, s2, c1, c2);
3040       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3041                                    c1 & 0x7F, c2 & 0x7F));
3042     }
3043   else
3044     return Qnil;
3045 }
3046
3047 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3048 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3049 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3050 */
3051        (ch))
3052 {
3053   Lisp_Object charset;
3054   int c1, c2, s1, s2;
3055
3056   CHECK_CHAR_COERCE_INT (ch);
3057   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3058   if (EQ (charset, Vcharset_japanese_jisx0208))
3059     {
3060       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3061       return Fcons (make_int (s1), make_int (s2));
3062     }
3063   else
3064     return Qnil;
3065 }
3066
3067 \f
3068 /************************************************************************/
3069 /*                            Big5 methods                              */
3070 /************************************************************************/
3071
3072 /* BIG5 is a coding system encoding two character sets: ASCII and
3073    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3074    character set and is encoded in two-byte.
3075
3076    --- CODE RANGE of BIG5 ---
3077    (character set)      (range)
3078    ASCII                0x00 .. 0x7F
3079    Big5 (1st byte)      0xA1 .. 0xFE
3080         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3081    --------------------------
3082
3083    Since the number of characters in Big5 is larger than maximum
3084    characters in Emacs' charset (96x96), it can't be handled as one
3085    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3086    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3087    contains frequently used characters and the latter contains less
3088    frequently used characters.  */
3089
3090 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3091   ((c) >= 0xA1 && (c) <= 0xFE)
3092
3093 /* Is this the second byte of a Shift-JIS two-byte char? */
3094
3095 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3096   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3097
3098 /* Number of Big5 characters which have the same code in 1st byte.  */
3099
3100 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3101
3102 /* Code conversion macros.  These are macros because they are used in
3103    inner loops during code conversion.
3104
3105    Note that temporary variables in macros introduce the classic
3106    dynamic-scoping problems with variable names.  We use capital-
3107    lettered variables in the assumption that XEmacs does not use
3108    capital letters in variables except in a very formalized way
3109    (e.g. Qstring). */
3110
3111 /* Convert Big5 code (b1, b2) into its internal string representation
3112    (lb, c1, c2). */
3113
3114 /* There is a much simpler way to split the Big5 charset into two.
3115    For the moment I'm going to leave the algorithm as-is because it
3116    claims to separate out the most-used characters into a single
3117    charset, which perhaps will lead to optimizations in various
3118    places.
3119
3120    The way the algorithm works is something like this:
3121
3122    Big5 can be viewed as a 94x157 charset, where the row is
3123    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3124    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3125    the split between low and high column numbers is apparently
3126    meaningless; ascending rows produce less and less frequent chars.
3127    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3128    the first charset, and the upper half (0xC9 .. 0xFE) to the
3129    second.  To do the conversion, we convert the character into
3130    a single number where 0 .. 156 is the first row, 157 .. 313
3131    is the second, etc.  That way, the characters are ordered by
3132    decreasing frequency.  Then we just chop the space in two
3133    and coerce the result into a 94x94 space.
3134    */
3135
3136 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3137 {                                                                       \
3138   int B1 = b1, B2 = b2;                                                 \
3139   unsigned int I                                                        \
3140     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3141                                                                         \
3142   if (B1 < 0xC9)                                                        \
3143     {                                                                   \
3144       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3145     }                                                                   \
3146   else                                                                  \
3147     {                                                                   \
3148       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3149       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3150     }                                                                   \
3151   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3152   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3153 } while (0)
3154
3155 /* Convert the internal string representation of a Big5 character
3156    (lb, c1, c2) into Big5 code (b1, b2). */
3157
3158 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3159 {                                                                       \
3160   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3161                                                                         \
3162   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3163     {                                                                   \
3164       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3165     }                                                                   \
3166   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3167   b2 = I % BIG5_SAME_ROW;                                               \
3168   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3169 } while (0)
3170
3171 static int
3172 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3173                     unsigned int n)
3174 {
3175   int c;
3176
3177   while (n--)
3178     {
3179       c = *src++;
3180       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3181           (c >= 0x80 && c <= 0xA0))
3182         return 0;
3183       if (st->big5.in_second_byte)
3184         {
3185           st->big5.in_second_byte = 0;
3186           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3187             return 0;
3188         }
3189       else if (c >= 0xA1)
3190         st->big5.in_second_byte = 1;
3191     }
3192   return CODING_CATEGORY_BIG5_MASK;
3193 }
3194
3195 /* Convert Big5 data to internal format. */
3196
3197 static void
3198 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3199                     unsigned_char_dynarr *dst, unsigned int n)
3200 {
3201   unsigned char c;
3202   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3203   unsigned int flags  = str->flags;
3204   unsigned int ch     = str->ch;
3205   eol_type_t eol_type = str->eol_type;
3206
3207   while (n--)
3208     {
3209       c = *src++;
3210       if (ch)
3211         {
3212           /* Previous character was first byte of Big5 char. */
3213           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3214             {
3215               unsigned char b1, b2, b3;
3216               DECODE_BIG5 (ch, c, b1, b2, b3);
3217               Dynarr_add (dst, b1);
3218               Dynarr_add (dst, b2);
3219               Dynarr_add (dst, b3);
3220             }
3221           else
3222             {
3223               DECODE_ADD_BINARY_CHAR (ch, dst);
3224               DECODE_ADD_BINARY_CHAR (c, dst);
3225             }
3226           ch = 0;
3227         }
3228       else
3229         {
3230           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3231           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3232             ch = c;
3233           else
3234             DECODE_ADD_BINARY_CHAR (c, dst);
3235         }
3236     label_continue_loop:;
3237     }
3238
3239   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3240
3241   str->flags = flags;
3242   str->ch    = ch;
3243 }
3244
3245 /* Convert internally-formatted data to Big5. */
3246
3247 static void
3248 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3249                     unsigned_char_dynarr *dst, unsigned int n)
3250 {
3251 #ifndef UTF2000
3252   unsigned char c;
3253   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3254   unsigned int flags  = str->flags;
3255   unsigned int ch     = str->ch;
3256   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3257
3258   while (n--)
3259     {
3260       c = *src++;
3261       if (c == '\n')
3262         {
3263           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3264             Dynarr_add (dst, '\r');
3265           if (eol_type != EOL_CR)
3266             Dynarr_add (dst, '\n');
3267         }
3268       else if (BYTE_ASCII_P (c))
3269         {
3270           /* ASCII. */
3271           Dynarr_add (dst, c);
3272         }
3273       else if (BUFBYTE_LEADING_BYTE_P (c))
3274         {
3275           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3276               c == LEADING_BYTE_CHINESE_BIG5_2)
3277             {
3278               /* A recognized leading byte. */
3279               ch = c;
3280               continue; /* not done with this character. */
3281             }
3282           /* otherwise just ignore this character. */
3283         }
3284       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3285                ch == LEADING_BYTE_CHINESE_BIG5_2)
3286         {
3287           /* Previous char was a recognized leading byte. */
3288           ch = (ch << 8) | c;
3289           continue; /* not done with this character. */
3290         }
3291       else if (ch)
3292         {
3293           /* Encountering second byte of a Big5 character. */
3294           unsigned char b1, b2;
3295
3296           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3297           Dynarr_add (dst, b1);
3298           Dynarr_add (dst, b2);
3299         }
3300
3301       ch = 0;
3302     }
3303
3304   str->flags = flags;
3305   str->ch    = ch;
3306 #endif
3307 }
3308
3309
3310 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3311 Decode a Big5 character CODE of BIG5 coding-system.
3312 CODE is the character code in BIG5, a cons of two integers.
3313 Return the corresponding character.
3314 */
3315        (code))
3316 {
3317   unsigned char c1, c2, b1, b2;
3318
3319   CHECK_CONS (code);
3320   CHECK_INT (XCAR (code));
3321   CHECK_INT (XCDR (code));
3322   b1 = XINT (XCAR (code));
3323   b2 = XINT (XCDR (code));
3324   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3325       BYTE_BIG5_TWO_BYTE_2_P (b2))
3326     {
3327       Charset_ID leading_byte;
3328       Lisp_Object charset;
3329       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3330       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3331       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3332     }
3333   else
3334     return Qnil;
3335 }
3336
3337 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3338 Encode the Big5 character CH to BIG5 coding-system.
3339 Return the corresponding character code in Big5.
3340 */
3341        (ch))
3342 {
3343   Lisp_Object charset;
3344   int c1, c2, b1, b2;
3345
3346   CHECK_CHAR_COERCE_INT (ch);
3347   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3348   if (EQ (charset, Vcharset_chinese_big5_1) ||
3349       EQ (charset, Vcharset_chinese_big5_2))
3350     {
3351       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3352                    b1, b2);
3353       return Fcons (make_int (b1), make_int (b2));
3354     }
3355   else
3356     return Qnil;
3357 }
3358
3359 \f
3360 /************************************************************************/
3361 /*                           UCS-4 methods                              */
3362 /************************************************************************/
3363
3364 static int
3365 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3366                     unsigned int n)
3367 {
3368   while (n--)
3369     {
3370       int c = *src++;
3371       switch (st->ucs4.in_byte)
3372         {
3373         case 0:
3374           if (c >= 128)
3375             return 0;
3376           else
3377             st->ucs4.in_byte++;
3378           break;
3379         case 3:
3380           st->ucs4.in_byte = 0;
3381           break;
3382         default:
3383           st->ucs4.in_byte++;
3384         }
3385     }
3386   return CODING_CATEGORY_UCS4_MASK;
3387 }
3388
3389 static void
3390 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3391                     unsigned_char_dynarr *dst, unsigned int n)
3392 {
3393   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3394   unsigned int flags = str->flags;
3395   unsigned int ch    = str->ch;
3396   unsigned char counter = str->counter;
3397
3398   while (n--)
3399     {
3400       unsigned char c = *src++;
3401       switch (counter)
3402         {
3403         case 0:
3404           ch = c;
3405           counter = 3;
3406           break;
3407         case 1:
3408           DECODE_ADD_UCS_CHAR ((ch << 8) | c, dst);
3409           ch = 0;
3410           counter = 0;
3411           break;
3412         default:
3413           ch = ( ch << 8 ) | c;
3414           counter--;
3415         }
3416     }
3417   if (counter & CODING_STATE_END)
3418     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3419
3420   str->flags = flags;
3421   str->ch    = ch;
3422   str->counter = counter;
3423 }
3424
3425 void
3426 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3427                   unsigned_char_dynarr *dst, unsigned int *flags)
3428 {
3429   Dynarr_add (dst,  ch >> 24);
3430   Dynarr_add (dst, (ch >> 16) & 255);
3431   Dynarr_add (dst, (ch >>  8) & 255);
3432   Dynarr_add (dst,  ch        & 255);
3433 }
3434
3435 void
3436 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3437                   unsigned int *flags)
3438 {
3439 }
3440
3441 \f
3442 /************************************************************************/
3443 /*                           UTF-8 methods                              */
3444 /************************************************************************/
3445
3446 static int
3447 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3448                     unsigned int n)
3449 {
3450   while (n--)
3451     {
3452       unsigned char c = *src++;
3453       switch (st->utf8.in_byte)
3454         {
3455         case 0:
3456           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3457             return 0;
3458           else if (c >= 0xfc)
3459             st->utf8.in_byte = 5;
3460           else if (c >= 0xf8)
3461             st->utf8.in_byte = 4;
3462           else if (c >= 0xf0)
3463             st->utf8.in_byte = 3;
3464           else if (c >= 0xe0)
3465             st->utf8.in_byte = 2;
3466           else if (c >= 0xc0)
3467             st->utf8.in_byte = 1;
3468           else if (c >= 0x80)
3469             return 0;
3470           break;
3471         default:
3472           if ((c & 0xc0) != 0x80)
3473             return 0;
3474           else
3475             st->utf8.in_byte--;
3476         }
3477     }
3478   return CODING_CATEGORY_UTF8_MASK;
3479 }
3480
3481 static void
3482 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3483                     unsigned_char_dynarr *dst, unsigned int n)
3484 {
3485   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3486   unsigned int flags  = str->flags;
3487   unsigned int ch     = str->ch;
3488   eol_type_t eol_type = str->eol_type;
3489   unsigned char counter = str->counter;
3490
3491   while (n--)
3492     {
3493       unsigned char c = *src++;
3494       switch (counter)
3495         {
3496         case 0:
3497           if ( c >= 0xfc )
3498             {
3499               ch = c & 0x01;
3500               counter = 5;
3501             }
3502           else if ( c >= 0xf8 )
3503             {
3504               ch = c & 0x03;
3505               counter = 4;
3506             }
3507           else if ( c >= 0xf0 )
3508             {
3509               ch = c & 0x07;
3510               counter = 3;
3511             }
3512           else if ( c >= 0xe0 )
3513             {
3514               ch = c & 0x0f;
3515               counter = 2;
3516             }
3517           else if ( c >= 0xc0 )
3518             {
3519               ch = c & 0x1f;
3520               counter = 1;
3521             }
3522           else
3523             {
3524               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3525               DECODE_ADD_UCS_CHAR (c, dst);
3526             }
3527           break;
3528         case 1:
3529           ch = ( ch << 6 ) | ( c & 0x3f );
3530           DECODE_ADD_UCS_CHAR (ch, dst);
3531           ch = 0;
3532           counter = 0;
3533           break;
3534         default:
3535           ch = ( ch << 6 ) | ( c & 0x3f );
3536           counter--;
3537         }
3538     label_continue_loop:;
3539     }
3540
3541   if (flags & CODING_STATE_END)
3542     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3543
3544   str->flags = flags;
3545   str->ch    = ch;
3546   str->counter = counter;
3547 }
3548
3549 void
3550 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
3551                   unsigned_char_dynarr *dst, unsigned int *flags)
3552 {
3553   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3554
3555   if (ch == '\n')
3556     {
3557       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3558         Dynarr_add (dst, '\r');
3559       if (eol_type != EOL_CR)
3560         Dynarr_add (dst, ch);
3561     }
3562   else if (ch <= 0x7f)
3563     {
3564       Dynarr_add (dst, ch);
3565     }
3566   else if (ch <= 0x7ff)
3567     {
3568       Dynarr_add (dst, (ch >> 6) | 0xc0);
3569       Dynarr_add (dst, (ch & 0x3f) | 0x80);
3570     }
3571   else if (ch <= 0xffff)
3572     {
3573       Dynarr_add (dst,  (ch >> 12) | 0xe0);
3574       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3575       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3576     }
3577   else if (ch <= 0x1fffff)
3578     {
3579       Dynarr_add (dst,  (ch >> 18) | 0xf0);
3580       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3581       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3582       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3583     }
3584   else if (ch <= 0x3ffffff)
3585     {
3586       Dynarr_add (dst,  (ch >> 24) | 0xf8);
3587       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3588       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3589       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3590       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3591     }
3592   else
3593     {
3594       Dynarr_add (dst,  (ch >> 30) | 0xfc);
3595       Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
3596       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3597       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3598       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3599       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3600     }
3601 }
3602
3603 void
3604 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3605                   unsigned int *flags)
3606 {
3607 }
3608
3609 \f
3610 /************************************************************************/
3611 /*                           ISO2022 methods                            */
3612 /************************************************************************/
3613
3614 /* The following note describes the coding system ISO2022 briefly.
3615    Since the intention of this note is to help understand the
3616    functions in this file, some parts are NOT ACCURATE or OVERLY
3617    SIMPLIFIED.  For thorough understanding, please refer to the
3618    original document of ISO2022.
3619
3620    ISO2022 provides many mechanisms to encode several character sets
3621    in 7-bit and 8-bit environments.  For 7-bit environments, all text
3622    is encoded using bytes less than 128.  This may make the encoded
3623    text a little bit longer, but the text passes more easily through
3624    several gateways, some of which strip off MSB (Most Signigant Bit).
3625
3626    There are two kinds of character sets: control character set and
3627    graphic character set.  The former contains control characters such
3628    as `newline' and `escape' to provide control functions (control
3629    functions are also provided by escape sequences).  The latter
3630    contains graphic characters such as 'A' and '-'.  Emacs recognizes
3631    two control character sets and many graphic character sets.
3632
3633    Graphic character sets are classified into one of the following
3634    four classes, according to the number of bytes (DIMENSION) and
3635    number of characters in one dimension (CHARS) of the set:
3636    - DIMENSION1_CHARS94
3637    - DIMENSION1_CHARS96
3638    - DIMENSION2_CHARS94
3639    - DIMENSION2_CHARS96
3640
3641    In addition, each character set is assigned an identification tag,
3642    unique for each set, called "final character" (denoted as <F>
3643    hereafter).  The <F> of each character set is decided by ECMA(*)
3644    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
3645    (0x30..0x3F are for private use only).
3646
3647    Note (*): ECMA = European Computer Manufacturers Association
3648
3649    Here are examples of graphic character set [NAME(<F>)]:
3650         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3651         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3652         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3653         o DIMENSION2_CHARS96 -- none for the moment
3654
3655    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3656         C0 [0x00..0x1F] -- control character plane 0
3657         GL [0x20..0x7F] -- graphic character plane 0
3658         C1 [0x80..0x9F] -- control character plane 1
3659         GR [0xA0..0xFF] -- graphic character plane 1
3660
3661    A control character set is directly designated and invoked to C0 or
3662    C1 by an escape sequence.  The most common case is that:
3663    - ISO646's  control character set is designated/invoked to C0, and
3664    - ISO6429's control character set is designated/invoked to C1,
3665    and usually these designations/invocations are omitted in encoded
3666    text.  In a 7-bit environment, only C0 can be used, and a control
3667    character for C1 is encoded by an appropriate escape sequence to
3668    fit into the environment.  All control characters for C1 are
3669    defined to have corresponding escape sequences.
3670
3671    A graphic character set is at first designated to one of four
3672    graphic registers (G0 through G3), then these graphic registers are
3673    invoked to GL or GR.  These designations and invocations can be
3674    done independently.  The most common case is that G0 is invoked to
3675    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
3676    these invocations and designations are omitted in encoded text.
3677    In a 7-bit environment, only GL can be used.
3678
3679    When a graphic character set of CHARS94 is invoked to GL, codes
3680    0x20 and 0x7F of the GL area work as control characters SPACE and
3681    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3682    be used.
3683
3684    There are two ways of invocation: locking-shift and single-shift.
3685    With locking-shift, the invocation lasts until the next different
3686    invocation, whereas with single-shift, the invocation affects the
3687    following character only and doesn't affect the locking-shift
3688    state.  Invocations are done by the following control characters or
3689    escape sequences:
3690
3691    ----------------------------------------------------------------------
3692    abbrev  function                  cntrl escape seq   description
3693    ----------------------------------------------------------------------
3694    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
3695    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
3696    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
3697    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
3698    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
3699    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
3700    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
3701    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
3702    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
3703    ----------------------------------------------------------------------
3704    (*) These are not used by any known coding system.
3705
3706    Control characters for these functions are defined by macros
3707    ISO_CODE_XXX in `coding.h'.
3708
3709    Designations are done by the following escape sequences:
3710    ----------------------------------------------------------------------
3711    escape sequence      description
3712    ----------------------------------------------------------------------
3713    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
3714    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
3715    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
3716    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
3717    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
3718    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
3719    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
3720    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
3721    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
3722    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
3723    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
3724    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
3725    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
3726    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
3727    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
3728    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
3729    ----------------------------------------------------------------------
3730
3731    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3732    of dimension 1, chars 94, and final character <F>, etc...
3733
3734    Note (*): Although these designations are not allowed in ISO2022,
3735    Emacs accepts them on decoding, and produces them on encoding
3736    CHARS96 character sets in a coding system which is characterized as
3737    7-bit environment, non-locking-shift, and non-single-shift.
3738
3739    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3740    '(' can be omitted.  We refer to this as "short-form" hereafter.
3741
3742    Now you may notice that there are a lot of ways for encoding the
3743    same multilingual text in ISO2022.  Actually, there exist many
3744    coding systems such as Compound Text (used in X11's inter client
3745    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3746    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3747    localized platforms), and all of these are variants of ISO2022.
3748
3749    In addition to the above, Emacs handles two more kinds of escape
3750    sequences: ISO6429's direction specification and Emacs' private
3751    sequence for specifying character composition.
3752
3753    ISO6429's direction specification takes the following form:
3754         o CSI ']'      -- end of the current direction
3755         o CSI '0' ']'  -- end of the current direction
3756         o CSI '1' ']'  -- start of left-to-right text
3757         o CSI '2' ']'  -- start of right-to-left text
3758    The control character CSI (0x9B: control sequence introducer) is
3759    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3760
3761    Character composition specification takes the following form:
3762         o ESC '0' -- start character composition
3763         o ESC '1' -- end character composition
3764    Since these are not standard escape sequences of any ISO standard,
3765    their use with these meanings is restricted to Emacs only.  */
3766
3767 static void
3768 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3769 {
3770   int i;
3771
3772   for (i = 0; i < 4; i++)
3773     {
3774       if (!NILP (coding_system))
3775         iso->charset[i] =
3776           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3777       else
3778         iso->charset[i] = Qt;
3779       iso->invalid_designated[i] = 0;
3780     }
3781   iso->esc = ISO_ESC_NOTHING;
3782   iso->esc_bytes_index = 0;
3783   iso->register_left = 0;
3784   iso->register_right = 1;
3785   iso->switched_dir_and_no_valid_charset_yet = 0;
3786   iso->invalid_switch_dir = 0;
3787   iso->output_direction_sequence = 0;
3788   iso->output_literally = 0;
3789 #ifdef ENABLE_COMPOSITE_CHARS
3790   if (iso->composite_chars)
3791     Dynarr_reset (iso->composite_chars);
3792 #endif
3793 }
3794
3795 static int
3796 fit_to_be_escape_quoted (unsigned char c)
3797 {
3798   switch (c)
3799     {
3800     case ISO_CODE_ESC:
3801     case ISO_CODE_CSI:
3802     case ISO_CODE_SS2:
3803     case ISO_CODE_SS3:
3804     case ISO_CODE_SO:
3805     case ISO_CODE_SI:
3806       return 1;
3807
3808     default:
3809       return 0;
3810     }
3811 }
3812
3813 /* Parse one byte of an ISO2022 escape sequence.
3814    If the result is an invalid escape sequence, return 0 and
3815    do not change anything in STR.  Otherwise, if the result is
3816    an incomplete escape sequence, update ISO2022.ESC and
3817    ISO2022.ESC_BYTES and return -1.  Otherwise, update
3818    all the state variables (but not ISO2022.ESC_BYTES) and
3819    return 1.
3820
3821    If CHECK_INVALID_CHARSETS is non-zero, check for designation
3822    or invocation of an invalid character set and treat that as
3823    an unrecognized escape sequence. */
3824
3825 static int
3826 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3827                    unsigned char c, unsigned int *flags,
3828                    int check_invalid_charsets)
3829 {
3830   /* (1) If we're at the end of a designation sequence, CS is the
3831      charset being designated and REG is the register to designate
3832      it to.
3833
3834      (2) If we're at the end of a locking-shift sequence, REG is
3835      the register to invoke and HALF (0 == left, 1 == right) is
3836      the half to invoke it into.
3837
3838      (3) If we're at the end of a single-shift sequence, REG is
3839      the register to invoke. */
3840   Lisp_Object cs = Qnil;
3841   int reg, half;
3842
3843   /* NOTE: This code does goto's all over the fucking place.
3844      The reason for this is that we're basically implementing
3845      a state machine here, and hierarchical languages like C
3846      don't really provide a clean way of doing this. */
3847
3848   if (! (*flags & CODING_STATE_ESCAPE))
3849     /* At beginning of escape sequence; we need to reset our
3850        escape-state variables. */
3851     iso->esc = ISO_ESC_NOTHING;
3852
3853   iso->output_literally = 0;
3854   iso->output_direction_sequence = 0;
3855
3856   switch (iso->esc)
3857     {
3858     case ISO_ESC_NOTHING:
3859       iso->esc_bytes_index = 0;
3860       switch (c)
3861         {
3862         case ISO_CODE_ESC:      /* Start escape sequence */
3863           *flags |= CODING_STATE_ESCAPE;
3864           iso->esc = ISO_ESC;
3865           goto not_done;
3866
3867         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
3868           *flags |= CODING_STATE_ESCAPE;
3869           iso->esc = ISO_ESC_5_11;
3870           goto not_done;
3871
3872         case ISO_CODE_SO:       /* locking shift 1 */
3873           reg = 1; half = 0;
3874           goto locking_shift;
3875         case ISO_CODE_SI:       /* locking shift 0 */
3876           reg = 0; half = 0;
3877           goto locking_shift;
3878
3879         case ISO_CODE_SS2:      /* single shift */
3880           reg = 2;
3881           goto single_shift;
3882         case ISO_CODE_SS3:      /* single shift */
3883           reg = 3;
3884           goto single_shift;
3885
3886         default:                        /* Other control characters */
3887           return 0;
3888         }
3889
3890     case ISO_ESC:
3891       switch (c)
3892         {
3893           /**** single shift ****/
3894
3895         case 'N':       /* single shift 2 */
3896           reg = 2;
3897           goto single_shift;
3898         case 'O':       /* single shift 3 */
3899           reg = 3;
3900           goto single_shift;
3901
3902           /**** locking shift ****/
3903
3904         case '~':       /* locking shift 1 right */
3905           reg = 1; half = 1;
3906           goto locking_shift;
3907         case 'n':       /* locking shift 2 */
3908           reg = 2; half = 0;
3909           goto locking_shift;
3910         case '}':       /* locking shift 2 right */
3911           reg = 2; half = 1;
3912           goto locking_shift;
3913         case 'o':       /* locking shift 3 */
3914           reg = 3; half = 0;
3915           goto locking_shift;
3916         case '|':       /* locking shift 3 right */
3917           reg = 3; half = 1;
3918           goto locking_shift;
3919
3920 #ifdef ENABLE_COMPOSITE_CHARS
3921           /**** composite ****/
3922
3923         case '0':
3924           iso->esc = ISO_ESC_START_COMPOSITE;
3925           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
3926             CODING_STATE_COMPOSITE;
3927           return 1;
3928
3929         case '1':
3930           iso->esc = ISO_ESC_END_COMPOSITE;
3931           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
3932             ~CODING_STATE_COMPOSITE;
3933           return 1;
3934 #endif /* ENABLE_COMPOSITE_CHARS */
3935
3936           /**** directionality ****/
3937
3938         case '[':
3939           iso->esc = ISO_ESC_5_11;
3940           goto not_done;
3941
3942           /**** designation ****/
3943
3944         case '$':       /* multibyte charset prefix */
3945           iso->esc = ISO_ESC_2_4;
3946           goto not_done;
3947
3948         default:
3949           if (0x28 <= c && c <= 0x2F)
3950             {
3951               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
3952               goto not_done;
3953             }
3954
3955           /* This function is called with CODESYS equal to nil when
3956              doing coding-system detection. */
3957           if (!NILP (codesys)
3958               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
3959               && fit_to_be_escape_quoted (c))
3960             {
3961               iso->esc = ISO_ESC_LITERAL;
3962               *flags &= CODING_STATE_ISO2022_LOCK;
3963               return 1;
3964             }
3965
3966           /* bzzzt! */
3967           return 0;
3968         }
3969
3970
3971
3972       /**** directionality ****/
3973
3974     case ISO_ESC_5_11:          /* ISO6429 direction control */
3975       if (c == ']')
3976         {
3977           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3978           goto directionality;
3979         }
3980       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
3981       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
3982       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
3983       else               return 0;
3984       goto not_done;
3985
3986     case ISO_ESC_5_11_0:
3987       if (c == ']')
3988         {
3989           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3990           goto directionality;
3991         }
3992       return 0;
3993
3994     case ISO_ESC_5_11_1:
3995       if (c == ']')
3996         {
3997           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3998           goto directionality;
3999         }
4000       return 0;
4001
4002     case ISO_ESC_5_11_2:
4003       if (c == ']')
4004         {
4005           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4006           goto directionality;
4007         }
4008       return 0;
4009
4010     directionality:
4011       iso->esc = ISO_ESC_DIRECTIONALITY;
4012       /* Various junk here to attempt to preserve the direction sequences
4013          literally in the text if they would otherwise be swallowed due
4014          to invalid designations that don't show up as actual charset
4015          changes in the text. */
4016       if (iso->invalid_switch_dir)
4017         {
4018           /* We already inserted a direction switch literally into the
4019              text.  We assume (#### this may not be right) that the
4020              next direction switch is the one going the other way,
4021              and we need to output that literally as well. */
4022           iso->output_literally = 1;
4023           iso->invalid_switch_dir = 0;
4024         }
4025       else
4026         {
4027           int jj;
4028
4029           /* If we are in the thrall of an invalid designation,
4030            then stick the directionality sequence literally into the
4031            output stream so it ends up in the original text again. */
4032           for (jj = 0; jj < 4; jj++)
4033             if (iso->invalid_designated[jj])
4034               break;
4035           if (jj < 4)
4036             {
4037               iso->output_literally = 1;
4038               iso->invalid_switch_dir = 1;
4039             }
4040           else
4041             /* Indicate that we haven't yet seen a valid designation,
4042                so that if a switch-dir is directly followed by an
4043                invalid designation, both get inserted literally. */
4044             iso->switched_dir_and_no_valid_charset_yet = 1;
4045         }
4046       return 1;
4047
4048
4049       /**** designation ****/
4050
4051     case ISO_ESC_2_4:
4052       if (0x28 <= c && c <= 0x2F)
4053         {
4054           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4055           goto not_done;
4056         }
4057       if (0x40 <= c && c <= 0x42)
4058         {
4059           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4060                                       *flags & CODING_STATE_R2L ?
4061                                       CHARSET_RIGHT_TO_LEFT :
4062                                       CHARSET_LEFT_TO_RIGHT);
4063           reg = 0;
4064           goto designated;
4065         }
4066       return 0;
4067
4068     default:
4069       {
4070         int type =-1;
4071
4072         if (c < '0' || c > '~')
4073           return 0; /* bad final byte */
4074
4075         if (iso->esc >= ISO_ESC_2_8 &&
4076             iso->esc <= ISO_ESC_2_15)
4077           {
4078             type = ((iso->esc >= ISO_ESC_2_12) ?
4079                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4080             reg = (iso->esc - ISO_ESC_2_8) & 3;
4081           }
4082         else if (iso->esc >= ISO_ESC_2_4_8 &&
4083                  iso->esc <= ISO_ESC_2_4_15)
4084           {
4085             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4086                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4087             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4088           }
4089         else
4090           {
4091             /* Can this ever be reached? -slb */
4092             abort();
4093           }
4094
4095         cs = CHARSET_BY_ATTRIBUTES (type, c,
4096                                     *flags & CODING_STATE_R2L ?
4097                                     CHARSET_RIGHT_TO_LEFT :
4098                                     CHARSET_LEFT_TO_RIGHT);
4099         goto designated;
4100       }
4101     }
4102
4103  not_done:
4104   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4105   return -1;
4106
4107  single_shift:
4108   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4109     /* can't invoke something that ain't there. */
4110     return 0;
4111   iso->esc = ISO_ESC_SINGLE_SHIFT;
4112   *flags &= CODING_STATE_ISO2022_LOCK;
4113   if (reg == 2)
4114     *flags |= CODING_STATE_SS2;
4115   else
4116     *flags |= CODING_STATE_SS3;
4117   return 1;
4118
4119  locking_shift:
4120   if (check_invalid_charsets &&
4121       !CHARSETP (iso->charset[reg]))
4122     /* can't invoke something that ain't there. */
4123     return 0;
4124   if (half)
4125     iso->register_right = reg;
4126   else
4127     iso->register_left = reg;
4128   *flags &= CODING_STATE_ISO2022_LOCK;
4129   iso->esc = ISO_ESC_LOCKING_SHIFT;
4130   return 1;
4131
4132  designated:
4133   if (NILP (cs) && check_invalid_charsets)
4134     {
4135       iso->invalid_designated[reg] = 1;
4136       iso->charset[reg] = Vcharset_ascii;
4137       iso->esc = ISO_ESC_DESIGNATE;
4138       *flags &= CODING_STATE_ISO2022_LOCK;
4139       iso->output_literally = 1;
4140       if (iso->switched_dir_and_no_valid_charset_yet)
4141         {
4142           /* We encountered a switch-direction followed by an
4143              invalid designation.  Ensure that the switch-direction
4144              gets outputted; otherwise it will probably get eaten
4145              when the text is written out again. */
4146           iso->switched_dir_and_no_valid_charset_yet = 0;
4147           iso->output_direction_sequence = 1;
4148           /* And make sure that the switch-dir going the other
4149              way gets outputted, as well. */
4150           iso->invalid_switch_dir = 1;
4151         }
4152       return 1;
4153     }
4154   /* This function is called with CODESYS equal to nil when
4155      doing coding-system detection. */
4156   if (!NILP (codesys))
4157     {
4158       charset_conversion_spec_dynarr *dyn =
4159         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4160
4161       if (dyn)
4162         {
4163           int i;
4164
4165           for (i = 0; i < Dynarr_length (dyn); i++)
4166             {
4167               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4168               if (EQ (cs, spec->from_charset))
4169                 cs = spec->to_charset;
4170             }
4171         }
4172     }
4173
4174   iso->charset[reg] = cs;
4175   iso->esc = ISO_ESC_DESIGNATE;
4176   *flags &= CODING_STATE_ISO2022_LOCK;
4177   if (iso->invalid_designated[reg])
4178     {
4179       iso->invalid_designated[reg] = 0;
4180       iso->output_literally = 1;
4181     }
4182   if (iso->switched_dir_and_no_valid_charset_yet)
4183     iso->switched_dir_and_no_valid_charset_yet = 0;
4184   return 1;
4185 }
4186
4187 static int
4188 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4189                        unsigned int n)
4190 {
4191   int mask;
4192
4193   /* #### There are serious deficiencies in the recognition mechanism
4194      here.  This needs to be much smarter if it's going to cut it.
4195      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4196      it should be detected as Latin-1.
4197      All the ISO2022 stuff in this file should be synced up with the
4198      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4199      Perhaps we should wait till R2L works in FSF Emacs? */
4200
4201   if (!st->iso2022.initted)
4202     {
4203       reset_iso2022 (Qnil, &st->iso2022.iso);
4204       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4205                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4206                           CODING_CATEGORY_ISO_8_1_MASK |
4207                           CODING_CATEGORY_ISO_8_2_MASK |
4208                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4209       st->iso2022.flags = 0;
4210       st->iso2022.high_byte_count = 0;
4211       st->iso2022.saw_single_shift = 0;
4212       st->iso2022.initted = 1;
4213     }
4214
4215   mask = st->iso2022.mask;
4216
4217   while (n--)
4218     {
4219       int c = *src++;
4220       if (c >= 0xA0)
4221         {
4222           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4223           st->iso2022.high_byte_count++;
4224         }
4225       else
4226         {
4227           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4228             {
4229               if (st->iso2022.high_byte_count & 1)
4230                 /* odd number of high bytes; assume not iso-8-2 */
4231                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4232             }
4233           st->iso2022.high_byte_count = 0;
4234           st->iso2022.saw_single_shift = 0;
4235           if (c > 0x80)
4236             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4237         }
4238       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4239           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4240         { /* control chars */
4241           switch (c)
4242             {
4243               /* Allow and ignore control characters that you might
4244                  reasonably see in a text file */
4245             case '\r':
4246             case '\n':
4247             case '\t':
4248             case  7: /* bell */
4249             case  8: /* backspace */
4250             case 11: /* vertical tab */
4251             case 12: /* form feed */
4252             case 26: /* MS-DOS C-z junk */
4253             case 31: /* '^_' -- for info */
4254               goto label_continue_loop;
4255
4256             default:
4257               break;
4258             }
4259         }
4260
4261       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4262           || BYTE_C1_P (c))
4263         {
4264           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4265                                  &st->iso2022.flags, 0))
4266             {
4267               switch (st->iso2022.iso.esc)
4268                 {
4269                 case ISO_ESC_DESIGNATE:
4270                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4271                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4272                   break;
4273                 case ISO_ESC_LOCKING_SHIFT:
4274                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4275                   goto ran_out_of_chars;
4276                 case ISO_ESC_SINGLE_SHIFT:
4277                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4278                   st->iso2022.saw_single_shift = 1;
4279                   break;
4280                 default:
4281                   break;
4282                 }
4283             }
4284           else
4285             {
4286               mask = 0;
4287               goto ran_out_of_chars;
4288             }
4289         }
4290     label_continue_loop:;
4291     }
4292
4293  ran_out_of_chars:
4294
4295   return mask;
4296 }
4297
4298 static int
4299 postprocess_iso2022_mask (int mask)
4300 {
4301   /* #### kind of cheesy */
4302   /* If seven-bit ISO is allowed, then assume that the encoding is
4303      entirely seven-bit and turn off the eight-bit ones. */
4304   if (mask & CODING_CATEGORY_ISO_7_MASK)
4305     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4306                CODING_CATEGORY_ISO_8_1_MASK |
4307                CODING_CATEGORY_ISO_8_2_MASK);
4308   return mask;
4309 }
4310
4311 /* If FLAGS is a null pointer or specifies right-to-left motion,
4312    output a switch-dir-to-left-to-right sequence to DST.
4313    Also update FLAGS if it is not a null pointer.
4314    If INTERNAL_P is set, we are outputting in internal format and
4315    need to handle the CSI differently. */
4316
4317 static void
4318 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4319                                  unsigned_char_dynarr *dst,
4320                                  unsigned int *flags,
4321                                  int internal_p)
4322 {
4323   if (!flags || (*flags & CODING_STATE_R2L))
4324     {
4325       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4326         {
4327           Dynarr_add (dst, ISO_CODE_ESC);
4328           Dynarr_add (dst, '[');
4329         }
4330       else if (internal_p)
4331         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4332       else
4333         Dynarr_add (dst, ISO_CODE_CSI);
4334       Dynarr_add (dst, '0');
4335       Dynarr_add (dst, ']');
4336       if (flags)
4337         *flags &= ~CODING_STATE_R2L;
4338     }
4339 }
4340
4341 /* If FLAGS is a null pointer or specifies a direction different from
4342    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4343    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4344    sequence to DST.  Also update FLAGS if it is not a null pointer.
4345    If INTERNAL_P is set, we are outputting in internal format and
4346    need to handle the CSI differently. */
4347
4348 static void
4349 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4350                           unsigned_char_dynarr *dst, unsigned int *flags,
4351                           int internal_p)
4352 {
4353   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4354       direction == CHARSET_LEFT_TO_RIGHT)
4355     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4356   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4357            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4358            direction == CHARSET_RIGHT_TO_LEFT)
4359     {
4360       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4361         {
4362           Dynarr_add (dst, ISO_CODE_ESC);
4363           Dynarr_add (dst, '[');
4364         }
4365       else if (internal_p)
4366         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4367       else
4368         Dynarr_add (dst, ISO_CODE_CSI);
4369       Dynarr_add (dst, '2');
4370       Dynarr_add (dst, ']');
4371       if (flags)
4372         *flags |= CODING_STATE_R2L;
4373     }
4374 }
4375
4376 /* Convert ISO2022-format data to internal format. */
4377
4378 static void
4379 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4380                        unsigned_char_dynarr *dst, unsigned int n)
4381 {
4382   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4383   unsigned int flags  = str->flags;
4384   unsigned int ch     = str->ch;
4385   eol_type_t eol_type = str->eol_type;
4386 #ifdef ENABLE_COMPOSITE_CHARS
4387   unsigned_char_dynarr *real_dst = dst;
4388 #endif
4389   Lisp_Object coding_system;
4390
4391   XSETCODING_SYSTEM (coding_system, str->codesys);
4392
4393 #ifdef ENABLE_COMPOSITE_CHARS
4394   if (flags & CODING_STATE_COMPOSITE)
4395     dst = str->iso2022.composite_chars;
4396 #endif /* ENABLE_COMPOSITE_CHARS */
4397
4398   while (n--)
4399     {
4400       unsigned char c = *src++;
4401       if (flags & CODING_STATE_ESCAPE)
4402         {       /* Within ESC sequence */
4403           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4404                                           c, &flags, 1);
4405
4406           if (retval)
4407             {
4408               switch (str->iso2022.esc)
4409                 {
4410 #ifdef ENABLE_COMPOSITE_CHARS
4411                 case ISO_ESC_START_COMPOSITE:
4412                   if (str->iso2022.composite_chars)
4413                     Dynarr_reset (str->iso2022.composite_chars);
4414                   else
4415                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4416                   dst = str->iso2022.composite_chars;
4417                   break;
4418                 case ISO_ESC_END_COMPOSITE:
4419                   {
4420                     Bufbyte comstr[MAX_EMCHAR_LEN];
4421                     Bytecount len;
4422                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4423                                                          Dynarr_length (dst));
4424                     dst = real_dst;
4425                     len = set_charptr_emchar (comstr, emch);
4426                     Dynarr_add_many (dst, comstr, len);
4427                     break;
4428                   }
4429 #endif /* ENABLE_COMPOSITE_CHARS */
4430
4431                 case ISO_ESC_LITERAL:
4432                   DECODE_ADD_BINARY_CHAR (c, dst);
4433                   break;
4434
4435                 default:
4436                   /* Everything else handled already */
4437                   break;
4438                 }
4439             }
4440
4441           /* Attempted error recovery. */
4442           if (str->iso2022.output_direction_sequence)
4443             ensure_correct_direction (flags & CODING_STATE_R2L ?
4444                                       CHARSET_RIGHT_TO_LEFT :
4445                                       CHARSET_LEFT_TO_RIGHT,
4446                                       str->codesys, dst, 0, 1);
4447           /* More error recovery. */
4448           if (!retval || str->iso2022.output_literally)
4449             {
4450               /* Output the (possibly invalid) sequence */
4451               int i;
4452               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4453                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4454               flags &= CODING_STATE_ISO2022_LOCK;
4455               if (!retval)
4456                 n++, src--;/* Repeat the loop with the same character. */
4457               else
4458                 {
4459                   /* No sense in reprocessing the final byte of the
4460                      escape sequence; it could mess things up anyway.
4461                      Just add it now. */
4462                   DECODE_ADD_BINARY_CHAR (c, dst);
4463                 }
4464             }
4465           ch = 0;
4466         }
4467       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4468         { /* Control characters */
4469
4470           /***** Error-handling *****/
4471
4472           /* If we were in the middle of a character, dump out the
4473              partial character. */
4474           DECODE_OUTPUT_PARTIAL_CHAR (ch);
4475
4476           /* If we just saw a single-shift character, dump it out.
4477              This may dump out the wrong sort of single-shift character,
4478              but least it will give an indication that something went
4479              wrong. */
4480           if (flags & CODING_STATE_SS2)
4481             {
4482               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4483               flags &= ~CODING_STATE_SS2;
4484             }
4485           if (flags & CODING_STATE_SS3)
4486             {
4487               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4488               flags &= ~CODING_STATE_SS3;
4489             }
4490
4491           /***** Now handle the control characters. *****/
4492
4493           /* Handle CR/LF */
4494           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4495
4496           flags &= CODING_STATE_ISO2022_LOCK;
4497
4498           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4499             DECODE_ADD_BINARY_CHAR (c, dst);
4500         }
4501       else
4502         {                       /* Graphic characters */
4503           Lisp_Object charset;
4504 #ifndef UTF2000
4505           Charset_ID lb;
4506 #endif
4507           int reg;
4508
4509           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4510
4511           /* Now determine the charset. */
4512           reg = ((flags & CODING_STATE_SS2) ? 2
4513                  : (flags & CODING_STATE_SS3) ? 3
4514                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4515                  : str->iso2022.register_left);
4516           charset = str->iso2022.charset[reg];
4517
4518           /* Error checking: */
4519           if (! CHARSETP (charset)
4520               || str->iso2022.invalid_designated[reg]
4521               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4522                   && XCHARSET_CHARS (charset) == 94))
4523             /* Mrmph.  We are trying to invoke a register that has no
4524                or an invalid charset in it, or trying to add a character
4525                outside the range of the charset.  Insert that char literally
4526                to preserve it for the output. */
4527             {
4528               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4529               DECODE_ADD_BINARY_CHAR (c, dst);
4530             }
4531
4532           else
4533             {
4534               /* Things are probably hunky-dorey. */
4535
4536               /* Fetch reverse charset, maybe. */
4537               if (((flags & CODING_STATE_R2L) &&
4538                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4539                   ||
4540                   (!(flags & CODING_STATE_R2L) &&
4541                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4542                 {
4543                   Lisp_Object new_charset =
4544                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4545                   if (!NILP (new_charset))
4546                     charset = new_charset;
4547                 }
4548
4549 #ifdef UTF2000
4550               if (XCHARSET_DIMENSION (charset) == 1)
4551                 {
4552                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4553                   DECODE_ADD_UCS_CHAR
4554                     (MAKE_CHAR (charset, c & 0x7F, 0), dst);
4555                 }
4556               else if (ch)
4557                 {
4558                   DECODE_ADD_UCS_CHAR
4559                     (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
4560                   ch = 0;
4561                 }
4562               else
4563                 ch = c;
4564 #else
4565               lb = XCHARSET_LEADING_BYTE (charset);
4566               switch (XCHARSET_REP_BYTES (charset))
4567                 {
4568                 case 1: /* ASCII */
4569                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4570                   Dynarr_add (dst, c & 0x7F);
4571                   break;
4572
4573                 case 2: /* one-byte official */
4574                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4575                   Dynarr_add (dst, lb);
4576                   Dynarr_add (dst, c | 0x80);
4577                   break;
4578
4579                 case 3: /* one-byte private or two-byte official */
4580                   if (XCHARSET_PRIVATE_P (charset))
4581                     {
4582                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
4583                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4584                       Dynarr_add (dst, lb);
4585                       Dynarr_add (dst, c | 0x80);
4586                     }
4587                   else
4588                     {
4589                       if (ch)
4590                         {
4591                           Dynarr_add (dst, lb);
4592                           Dynarr_add (dst, ch | 0x80);
4593                           Dynarr_add (dst, c | 0x80);
4594                           ch = 0;
4595                         }
4596                       else
4597                         ch = c;
4598                     }
4599                   break;
4600
4601                 default:        /* two-byte private */
4602                   if (ch)
4603                     {
4604                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4605                       Dynarr_add (dst, lb);
4606                       Dynarr_add (dst, ch | 0x80);
4607                       Dynarr_add (dst, c | 0x80);
4608                       ch = 0;
4609                     }
4610                   else
4611                     ch = c;
4612                 }
4613 #endif
4614             }
4615
4616           if (!ch)
4617             flags &= CODING_STATE_ISO2022_LOCK;
4618         }
4619
4620     label_continue_loop:;
4621     }
4622
4623   if (flags & CODING_STATE_END)
4624     DECODE_OUTPUT_PARTIAL_CHAR (ch);
4625
4626   str->flags = flags;
4627   str->ch    = ch;
4628 }
4629
4630
4631 /***** ISO2022 encoder *****/
4632
4633 /* Designate CHARSET into register REG. */
4634
4635 static void
4636 iso2022_designate (Lisp_Object charset, unsigned char reg,
4637                    struct encoding_stream *str, unsigned_char_dynarr *dst)
4638 {
4639   static CONST char inter94[] = "()*+";
4640   static CONST char inter96[] = ",-./";
4641   unsigned int type;
4642   unsigned char final;
4643   Lisp_Object old_charset = str->iso2022.charset[reg];
4644
4645   str->iso2022.charset[reg] = charset;
4646   if (!CHARSETP (charset))
4647     /* charset might be an initial nil or t. */
4648     return;
4649   type = XCHARSET_TYPE (charset);
4650   final = XCHARSET_FINAL (charset);
4651   if (!str->iso2022.force_charset_on_output[reg] &&
4652       CHARSETP (old_charset) &&
4653       XCHARSET_TYPE (old_charset) == type &&
4654       XCHARSET_FINAL (old_charset) == final)
4655     return;
4656
4657   str->iso2022.force_charset_on_output[reg] = 0;
4658
4659   {
4660     charset_conversion_spec_dynarr *dyn =
4661       str->codesys->iso2022.output_conv;
4662
4663     if (dyn)
4664       {
4665         int i;
4666
4667         for (i = 0; i < Dynarr_length (dyn); i++)
4668           {
4669             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4670             if (EQ (charset, spec->from_charset))
4671                 charset = spec->to_charset;
4672           }
4673       }
4674   }
4675
4676   Dynarr_add (dst, ISO_CODE_ESC);
4677   switch (type)
4678     {
4679     case CHARSET_TYPE_94:
4680       Dynarr_add (dst, inter94[reg]);
4681       break;
4682     case CHARSET_TYPE_96:
4683       Dynarr_add (dst, inter96[reg]);
4684       break;
4685     case CHARSET_TYPE_94X94:
4686       Dynarr_add (dst, '$');
4687       if (reg != 0
4688           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4689           || final < '@'
4690           || final > 'B')
4691         Dynarr_add (dst, inter94[reg]);
4692       break;
4693     case CHARSET_TYPE_96X96:
4694       Dynarr_add (dst, '$');
4695       Dynarr_add (dst, inter96[reg]);
4696       break;
4697     }
4698   Dynarr_add (dst, final);
4699 }
4700
4701 static void
4702 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4703 {
4704   if (str->iso2022.register_left != 0)
4705     {
4706       Dynarr_add (dst, ISO_CODE_SI);
4707       str->iso2022.register_left = 0;
4708     }
4709 }
4710
4711 static void
4712 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4713 {
4714   if (str->iso2022.register_left != 1)
4715     {
4716       Dynarr_add (dst, ISO_CODE_SO);
4717       str->iso2022.register_left = 1;
4718     }
4719 }
4720
4721 void
4722 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
4723                      unsigned_char_dynarr *dst, unsigned int *flags)
4724 {
4725   unsigned char charmask;
4726   Lisp_Coding_System* codesys = str->codesys;
4727   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
4728   int i;
4729   Lisp_Object charset = str->iso2022.current_charset;
4730   int half = str->iso2022.current_half;
4731   unsigned int byte1, byte2;
4732
4733   if (ch <= 0x7F)
4734     {
4735       restore_left_to_right_direction (codesys, dst, flags, 0);
4736
4737       /* Make sure G0 contains ASCII */
4738       if ((ch > ' ' && ch < ISO_CODE_DEL)
4739           || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4740         {
4741           ensure_normal_shift (str, dst);
4742           iso2022_designate (Vcharset_ascii, 0, str, dst);
4743         }
4744
4745       /* If necessary, restore everything to the default state
4746          at end-of-line */
4747       if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4748         {
4749           restore_left_to_right_direction (codesys, dst, flags, 0);
4750
4751           ensure_normal_shift (str, dst);
4752
4753           for (i = 0; i < 4; i++)
4754             {
4755               Lisp_Object initial_charset =
4756                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4757               iso2022_designate (initial_charset, i, str, dst);
4758             }
4759         }
4760       if (ch == '\n')
4761         {
4762           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4763             Dynarr_add (dst, '\r');
4764           if (eol_type != EOL_CR)
4765             Dynarr_add (dst, ch);
4766         }
4767       else
4768         {
4769           if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4770               && fit_to_be_escape_quoted (ch))
4771             Dynarr_add (dst, ISO_CODE_ESC);
4772           Dynarr_add (dst, ch);
4773         }
4774     }
4775   else if ( (0x80 <= ch) && (ch <= 0x9f) )
4776     {
4777       charmask = (half == 0 ? 0x00 : 0x80);
4778
4779       if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4780           && fit_to_be_escape_quoted (ch))
4781         Dynarr_add (dst, ISO_CODE_ESC);
4782       /* you asked for it ... */
4783       Dynarr_add (dst, ch);
4784     }
4785   else
4786     {
4787       int reg;
4788
4789       BREAKUP_CHAR (ch, charset, byte1, byte2);
4790       ensure_correct_direction (XCHARSET_DIRECTION (charset),
4791                                 codesys, dst, flags, 0);
4792
4793       /* Now determine which register to use. */
4794       reg = -1;
4795       for (i = 0; i < 4; i++)
4796         {
4797           if (EQ (charset, str->iso2022.charset[i]) ||
4798               EQ (charset,
4799                   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4800             {
4801               reg = i;
4802               break;
4803             }
4804         }
4805
4806       if (reg == -1)
4807         {
4808           if (XCHARSET_GRAPHIC (charset) != 0)
4809             {
4810               if (!NILP (str->iso2022.charset[1]) &&
4811                   (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
4812                    || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4813                 reg = 1;
4814               else if (!NILP (str->iso2022.charset[2]))
4815                 reg = 2;
4816               else if (!NILP (str->iso2022.charset[3]))
4817                 reg = 3;
4818               else
4819                 reg = 0;
4820             }
4821           else
4822             reg = 0;
4823         }
4824
4825       iso2022_designate (charset, reg, str, dst);
4826
4827       /* Now invoke that register. */
4828       switch (reg)
4829         {
4830         case 0:
4831           ensure_normal_shift (str, dst);
4832           half = 0;
4833           break;
4834         case 1:
4835           if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4836             {
4837               ensure_shift_out (str, dst);
4838               half = 0;
4839             }
4840           else
4841             half = 1;
4842           break;
4843         case 2:
4844           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4845             {
4846               Dynarr_add (dst, ISO_CODE_ESC);
4847               Dynarr_add (dst, 'N');
4848               half = 0;
4849             }
4850           else
4851             {
4852               Dynarr_add (dst, ISO_CODE_SS2);
4853               half = 1;
4854             }
4855           break;
4856         case 3:
4857           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4858             {
4859               Dynarr_add (dst, ISO_CODE_ESC);
4860               Dynarr_add (dst, 'O');
4861               half = 0;
4862             }
4863           else
4864             {
4865               Dynarr_add (dst, ISO_CODE_SS3);
4866               half = 1;
4867             }
4868           break;
4869         default:
4870           abort ();
4871         }
4872
4873       charmask = (half == 0 ? 0x00 : 0x80);
4874
4875       switch (XCHARSET_DIMENSION (charset))
4876         {
4877         case 1:
4878           Dynarr_add (dst, byte1 | charmask);
4879           break;
4880         case 2:
4881           Dynarr_add (dst, byte1 | charmask);
4882           Dynarr_add (dst, byte2 | charmask);
4883           break;
4884         default:
4885           abort ();
4886         }
4887     }
4888   str->iso2022.current_charset = charset;
4889   str->iso2022.current_half = half;
4890 }
4891
4892 void
4893 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4894                      unsigned int *flags)
4895 {
4896   Lisp_Coding_System* codesys = str->codesys;
4897   int i;
4898
4899   restore_left_to_right_direction (codesys, dst, flags, 0);
4900   ensure_normal_shift (str, dst);
4901   for (i = 0; i < 4; i++)
4902     {
4903       Lisp_Object initial_charset
4904         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4905       iso2022_designate (initial_charset, i, str, dst);
4906     }
4907 }
4908 #endif /* MULE */
4909 \f
4910 /************************************************************************/
4911 /*                     No-conversion methods                            */
4912 /************************************************************************/
4913
4914 /* This is used when reading in "binary" files -- i.e. files that may
4915    contain all 256 possible byte values and that are not to be
4916    interpreted as being in any particular decoding. */
4917 static void
4918 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
4919                              unsigned_char_dynarr *dst, unsigned int n)
4920 {
4921   unsigned char c;
4922   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4923   unsigned int flags  = str->flags;
4924   unsigned int ch     = str->ch;
4925   eol_type_t eol_type = str->eol_type;
4926
4927   while (n--)
4928     {
4929       c = *src++;
4930
4931       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4932       DECODE_ADD_BINARY_CHAR (c, dst);
4933     label_continue_loop:;
4934     }
4935
4936   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
4937
4938   str->flags = flags;
4939   str->ch    = ch;
4940 }
4941
4942 static void
4943 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
4944                              unsigned_char_dynarr *dst, unsigned int n)
4945 {
4946   unsigned char c;
4947   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4948   unsigned int flags  = str->flags;
4949   unsigned int ch     = str->ch;
4950   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4951 #ifdef UTF2000
4952   unsigned char char_boundary = str->iso2022.current_char_boundary;
4953 #endif
4954
4955   while (n--)
4956     {
4957       c = *src++;
4958 #ifdef UTF2000
4959       switch (char_boundary)
4960         {
4961         case 0:
4962           if ( c >= 0xfc )
4963             {
4964               ch = c & 0x01;
4965               char_boundary = 5;
4966             }
4967           else if ( c >= 0xf8 )
4968             {
4969               ch = c & 0x03;
4970               char_boundary = 4;
4971             }
4972           else if ( c >= 0xf0 )
4973             {
4974               ch = c & 0x07;
4975               char_boundary = 3;
4976             }
4977           else if ( c >= 0xe0 )
4978             {
4979               ch = c & 0x0f;
4980               char_boundary = 2;
4981             }
4982           else if ( c >= 0xc0 )
4983             {
4984               ch = c & 0x1f;
4985               char_boundary = 1;
4986             }
4987           else
4988             {
4989               ch = 0;
4990
4991               if (c == '\n')
4992                 {
4993                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4994                     Dynarr_add (dst, '\r');
4995                   if (eol_type != EOL_CR)
4996                     Dynarr_add (dst, c);
4997                 }
4998               else
4999                 Dynarr_add (dst, c);
5000               char_boundary = 0;
5001             }
5002           break;
5003         case 1:
5004           ch = ( ch << 6 ) | ( c & 0x3f );
5005           switch ( str->codesys->fixed.size )
5006             {
5007             case 1:
5008               Dynarr_add (dst, ch & 0xff);
5009               break;
5010             case 2:
5011               Dynarr_add (dst, (ch >> 8) & 0xff);
5012               Dynarr_add (dst,  ch       & 0xff);
5013               break;
5014             case 3:
5015               Dynarr_add (dst, (ch >> 16) & 0xff);
5016               Dynarr_add (dst, (ch >>  8) & 0xff);
5017               Dynarr_add (dst,  ch        & 0xff);
5018               break;
5019             case 4:
5020               Dynarr_add (dst, (ch >> 24) & 0xff);
5021               Dynarr_add (dst, (ch >> 16) & 0xff);
5022               Dynarr_add (dst, (ch >>  8) & 0xff);
5023               Dynarr_add (dst,  ch        & 0xff);
5024               break;
5025             default:
5026               fprintf(stderr, "It seems %d bytes stream.\n",
5027                       str->codesys->fixed.size);
5028               abort ();
5029             }
5030           char_boundary = 0;
5031           break;
5032         default:
5033           ch = ( ch << 6 ) | ( c & 0x3f );
5034           char_boundary--;
5035         }
5036 #else /* not UTF2000 */
5037       if (c == '\n')
5038         {
5039           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5040             Dynarr_add (dst, '\r');
5041           if (eol_type != EOL_CR)
5042             Dynarr_add (dst, '\n');
5043           ch = 0;
5044         }
5045       else if (BYTE_ASCII_P (c))
5046         {
5047           assert (ch == 0);
5048           Dynarr_add (dst, c);
5049         }
5050       else if (BUFBYTE_LEADING_BYTE_P (c))
5051         {
5052           assert (ch == 0);
5053           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5054               c == LEADING_BYTE_CONTROL_1)
5055             ch = c;
5056           else
5057             Dynarr_add (dst, '~'); /* untranslatable character */
5058         }
5059       else
5060         {
5061           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5062             Dynarr_add (dst, c);
5063           else if (ch == LEADING_BYTE_CONTROL_1)
5064             {
5065               assert (c < 0xC0);
5066               Dynarr_add (dst, c - 0x20);
5067             }
5068           /* else it should be the second or third byte of an
5069              untranslatable character, so ignore it */
5070           ch = 0;
5071         }
5072 #endif /* not UTF2000 */
5073     }
5074
5075   str->flags = flags;
5076   str->ch    = ch;
5077 #ifdef UTF2000
5078   str->iso2022.current_char_boundary = char_boundary;
5079 #endif
5080 }
5081
5082 \f
5083 /************************************************************************/
5084 /*                   Simple internal/external functions                 */
5085 /************************************************************************/
5086
5087 static Extbyte_dynarr *conversion_out_dynarr;
5088 static Bufbyte_dynarr *conversion_in_dynarr;
5089
5090 /* Determine coding system from coding format */
5091
5092 /* #### not correct for all values of `fmt'! */
5093 static Lisp_Object
5094 external_data_format_to_coding_system (enum external_data_format fmt)
5095 {
5096   switch (fmt)
5097     {
5098     case FORMAT_FILENAME:
5099     case FORMAT_TERMINAL:
5100       if (EQ (Vfile_name_coding_system, Qnil) ||
5101           EQ (Vfile_name_coding_system, Qbinary))
5102         return Qnil;
5103       else
5104         return Fget_coding_system (Vfile_name_coding_system);
5105 #ifdef MULE
5106     case FORMAT_CTEXT:
5107       return Fget_coding_system (Qctext);
5108 #endif
5109     default:
5110       return Qnil;
5111     }
5112 }
5113
5114 Extbyte *
5115 convert_to_external_format (CONST Bufbyte *ptr,
5116                             Bytecount len,
5117                             Extcount *len_out,
5118                             enum external_data_format fmt)
5119 {
5120   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5121
5122   if (!conversion_out_dynarr)
5123     conversion_out_dynarr = Dynarr_new (Extbyte);
5124   else
5125     Dynarr_reset (conversion_out_dynarr);
5126
5127   if (NILP (coding_system))
5128     {
5129       CONST Bufbyte *end = ptr + len;
5130
5131       for (; ptr < end;)
5132         {
5133 #ifdef UTF2000
5134           Bufbyte c =
5135             (*ptr < 0xc0) ? *ptr :
5136             ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5137 #else
5138           Bufbyte c =
5139             (BYTE_ASCII_P (*ptr))                  ? *ptr :
5140             (*ptr == LEADING_BYTE_CONTROL_1)       ? (*(ptr+1) - 0x20) :
5141             (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5142             '~';
5143 #endif
5144           Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5145           INC_CHARPTR (ptr);
5146         }
5147
5148 #ifdef ERROR_CHECK_BUFPOS
5149       assert (ptr == end);
5150 #endif
5151     }
5152   else
5153     {
5154       Lisp_Object instream, outstream, da_outstream;
5155       Lstream *istr, *ostr;
5156       struct gcpro gcpro1, gcpro2, gcpro3;
5157       char tempbuf[1024]; /* some random amount */
5158
5159       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5160       da_outstream = make_dynarr_output_stream
5161         ((unsigned_char_dynarr *) conversion_out_dynarr);
5162       outstream =
5163         make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5164       istr = XLSTREAM (instream);
5165       ostr = XLSTREAM (outstream);
5166       GCPRO3 (instream, outstream, da_outstream);
5167       while (1)
5168         {
5169           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5170           if (!size_in_bytes)
5171             break;
5172           Lstream_write (ostr, tempbuf, size_in_bytes);
5173         }
5174       Lstream_close (istr);
5175       Lstream_close (ostr);
5176       UNGCPRO;
5177       Lstream_delete (istr);
5178       Lstream_delete (ostr);
5179       Lstream_delete (XLSTREAM (da_outstream));
5180     }
5181
5182   *len_out = Dynarr_length (conversion_out_dynarr);
5183   Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5184   return Dynarr_atp (conversion_out_dynarr, 0);
5185 }
5186
5187 Bufbyte *
5188 convert_from_external_format (CONST Extbyte *ptr,
5189                               Extcount len,
5190                               Bytecount *len_out,
5191                               enum external_data_format fmt)
5192 {
5193   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5194
5195   if (!conversion_in_dynarr)
5196     conversion_in_dynarr = Dynarr_new (Bufbyte);
5197   else
5198     Dynarr_reset (conversion_in_dynarr);
5199
5200   if (NILP (coding_system))
5201     {
5202       CONST Extbyte *end = ptr + len;
5203       for (; ptr < end; ptr++)
5204         {
5205           Extbyte c = *ptr;
5206           DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5207         }
5208     }
5209   else
5210     {
5211       Lisp_Object instream, outstream, da_outstream;
5212       Lstream *istr, *ostr;
5213       struct gcpro gcpro1, gcpro2, gcpro3;
5214       char tempbuf[1024]; /* some random amount */
5215
5216       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5217       da_outstream = make_dynarr_output_stream
5218         ((unsigned_char_dynarr *) conversion_in_dynarr);
5219       outstream =
5220         make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5221       istr = XLSTREAM (instream);
5222       ostr = XLSTREAM (outstream);
5223       GCPRO3 (instream, outstream, da_outstream);
5224       while (1)
5225         {
5226           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5227           if (!size_in_bytes)
5228             break;
5229           Lstream_write (ostr, tempbuf, size_in_bytes);
5230         }
5231       Lstream_close (istr);
5232       Lstream_close (ostr);
5233       UNGCPRO;
5234       Lstream_delete (istr);
5235       Lstream_delete (ostr);
5236       Lstream_delete (XLSTREAM (da_outstream));
5237     }
5238
5239   *len_out = Dynarr_length (conversion_in_dynarr);
5240   Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5241   return Dynarr_atp (conversion_in_dynarr, 0);
5242 }
5243
5244 \f
5245 /************************************************************************/
5246 /*                             Initialization                           */
5247 /************************************************************************/
5248
5249 void
5250 syms_of_file_coding (void)
5251 {
5252   defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5253   deferror (&Qcoding_system_error, "coding-system-error",
5254             "Coding-system error", Qio_error);
5255
5256   DEFSUBR (Fcoding_system_p);
5257   DEFSUBR (Ffind_coding_system);
5258   DEFSUBR (Fget_coding_system);
5259   DEFSUBR (Fcoding_system_list);
5260   DEFSUBR (Fcoding_system_name);
5261   DEFSUBR (Fmake_coding_system);
5262   DEFSUBR (Fcopy_coding_system);
5263   DEFSUBR (Fdefine_coding_system_alias);
5264   DEFSUBR (Fsubsidiary_coding_system);
5265
5266   DEFSUBR (Fcoding_system_type);
5267   DEFSUBR (Fcoding_system_doc_string);
5268 #ifdef MULE
5269   DEFSUBR (Fcoding_system_charset);
5270 #endif
5271   DEFSUBR (Fcoding_system_property);
5272
5273   DEFSUBR (Fcoding_category_list);
5274   DEFSUBR (Fset_coding_priority_list);
5275   DEFSUBR (Fcoding_priority_list);
5276   DEFSUBR (Fset_coding_category_system);
5277   DEFSUBR (Fcoding_category_system);
5278
5279   DEFSUBR (Fdetect_coding_region);
5280   DEFSUBR (Fdecode_coding_region);
5281   DEFSUBR (Fencode_coding_region);
5282 #ifdef MULE
5283   DEFSUBR (Fdecode_shift_jis_char);
5284   DEFSUBR (Fencode_shift_jis_char);
5285   DEFSUBR (Fdecode_big5_char);
5286   DEFSUBR (Fencode_big5_char);
5287 #endif /* MULE */
5288   defsymbol (&Qcoding_system_p, "coding-system-p");
5289   defsymbol (&Qno_conversion, "no-conversion");
5290   defsymbol (&Qraw_text, "raw-text");
5291 #ifdef MULE
5292   defsymbol (&Qbig5, "big5");
5293   defsymbol (&Qshift_jis, "shift-jis");
5294   defsymbol (&Qucs4, "ucs-4");
5295   defsymbol (&Qutf8, "utf-8");
5296   defsymbol (&Qccl, "ccl");
5297   defsymbol (&Qiso2022, "iso2022");
5298 #endif /* MULE */
5299   defsymbol (&Qmnemonic, "mnemonic");
5300   defsymbol (&Qeol_type, "eol-type");
5301   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5302   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5303
5304   defsymbol (&Qcr, "cr");
5305   defsymbol (&Qlf, "lf");
5306   defsymbol (&Qcrlf, "crlf");
5307   defsymbol (&Qeol_cr, "eol-cr");
5308   defsymbol (&Qeol_lf, "eol-lf");
5309   defsymbol (&Qeol_crlf, "eol-crlf");
5310 #ifdef MULE
5311   defsymbol (&Qcharset_g0, "charset-g0");
5312   defsymbol (&Qcharset_g1, "charset-g1");
5313   defsymbol (&Qcharset_g2, "charset-g2");
5314   defsymbol (&Qcharset_g3, "charset-g3");
5315   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5316   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5317   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5318   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5319   defsymbol (&Qno_iso6429, "no-iso6429");
5320   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5321   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5322
5323   defsymbol (&Qshort, "short");
5324   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5325   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5326   defsymbol (&Qseven, "seven");
5327   defsymbol (&Qlock_shift, "lock-shift");
5328   defsymbol (&Qescape_quoted, "escape-quoted");
5329 #endif /* MULE */
5330   defsymbol (&Qencode, "encode");
5331   defsymbol (&Qdecode, "decode");
5332
5333 #ifdef MULE
5334   defsymbol (&Qctext, "ctext");
5335   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5336              "shift-jis");
5337   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5338              "big5");
5339   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5340              "ucs-4");
5341   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5342              "utf-8");
5343   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5344              "iso-7");
5345   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5346              "iso-8-designate");
5347   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5348              "iso-8-1");
5349   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5350              "iso-8-2");
5351   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5352              "iso-lock-shift");
5353 #endif /* MULE */
5354   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5355              "no-conversion");
5356 }
5357
5358 void
5359 lstream_type_create_file_coding (void)
5360 {
5361   LSTREAM_HAS_METHOD (decoding, reader);
5362   LSTREAM_HAS_METHOD (decoding, writer);
5363   LSTREAM_HAS_METHOD (decoding, rewinder);
5364   LSTREAM_HAS_METHOD (decoding, seekable_p);
5365   LSTREAM_HAS_METHOD (decoding, flusher);
5366   LSTREAM_HAS_METHOD (decoding, closer);
5367   LSTREAM_HAS_METHOD (decoding, marker);
5368
5369   LSTREAM_HAS_METHOD (encoding, reader);
5370   LSTREAM_HAS_METHOD (encoding, writer);
5371   LSTREAM_HAS_METHOD (encoding, rewinder);
5372   LSTREAM_HAS_METHOD (encoding, seekable_p);
5373   LSTREAM_HAS_METHOD (encoding, flusher);
5374   LSTREAM_HAS_METHOD (encoding, closer);
5375   LSTREAM_HAS_METHOD (encoding, marker);
5376 }
5377
5378 void
5379 vars_of_file_coding (void)
5380 {
5381   int i;
5382
5383   /* Initialize to something reasonable ... */
5384   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5385     {
5386       coding_category_system[i] = Qnil;
5387       coding_category_by_priority[i] = i;
5388     }
5389
5390   Fprovide (intern ("file-coding"));
5391
5392   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5393 Coding system used for TTY keyboard input.
5394 Not used under a windowing system.
5395 */ );
5396   Vkeyboard_coding_system = Qnil;
5397
5398   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5399 Coding system used for TTY display output.
5400 Not used under a windowing system.
5401 */ );
5402   Vterminal_coding_system = Qnil;
5403
5404   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5405 Overriding coding system used when writing a file or process.
5406 You should *bind* this, not set it.  If this is non-nil, it specifies
5407 the coding system that will be used when a file or process is read
5408 in, and overrides `buffer-file-coding-system-for-read',
5409 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5410 this one for permanent changes to the environment.
5411 */ );
5412   Vcoding_system_for_read = Qnil;
5413
5414   DEFVAR_LISP ("coding-system-for-write",
5415                &Vcoding_system_for_write /*
5416 Overriding coding system used when writing a file or process.
5417 You should *bind* this, not set it.  If this is non-nil, it specifies
5418 the coding system that will be used when a file or process is wrote
5419 in, and overrides `buffer-file-coding-system',
5420 `write-region-pre-hook', etc.  Use those variables instead of this one
5421 for permanent changes to the environment.
5422 */ );
5423   Vcoding_system_for_write = Qnil;
5424
5425   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5426 Coding system used to convert pathnames when accessing files.
5427 */ );
5428   Vfile_name_coding_system = Qnil;
5429
5430   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5431 Non-nil means the buffer contents are regarded as multi-byte form
5432 of characters, not a binary code.  This affects the display, file I/O,
5433 and behaviors of various editing commands.
5434
5435 Setting this to nil does not do anything.
5436 */ );
5437   enable_multibyte_characters = 1;
5438 }
5439
5440 void
5441 complex_vars_of_file_coding (void)
5442 {
5443   staticpro (&Vcoding_system_hash_table);
5444   Vcoding_system_hash_table =
5445     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5446
5447   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5448
5449 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5450 {                                               \
5451   struct codesys_prop csp;                      \
5452   csp.sym = (Sym);                              \
5453   csp.prop_type = (Prop_Type);                  \
5454   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5455 } while (0)
5456
5457   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5458   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5459   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5460   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5461   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5462   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5463   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5464 #ifdef MULE
5465   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5466   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5467   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5468   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5469   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5470   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5471   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5472   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5473   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5474   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5475   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5476   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5477   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5478   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5479   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5480   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5481   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5482
5483   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5484   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5485 #endif /* MULE */
5486   /* Need to create this here or we're really screwed. */
5487   Fmake_coding_system
5488     (Qraw_text, Qno_conversion,
5489      build_string ("Raw text, which means it converts only line-break-codes."),
5490      list2 (Qmnemonic, build_string ("Raw")));
5491
5492   Fmake_coding_system
5493     (Qbinary, Qno_conversion,
5494      build_string ("Binary, which means it does not convert anything."),
5495      list4 (Qeol_type, Qlf,
5496             Qmnemonic, build_string ("Binary")));
5497
5498 #ifdef UTF2000
5499   Fmake_coding_system
5500     (Qutf8, Qutf8,
5501      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5502      list2 (Qmnemonic, build_string ("UTF8")));
5503 #endif
5504
5505   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5506
5507   /* Need this for bootstrapping */
5508   coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5509     Fget_coding_system (Qraw_text);
5510
5511 #ifdef UTF2000
5512   coding_category_system[CODING_CATEGORY_UTF8]
5513    = Fget_coding_system (Qutf8);
5514 #endif
5515 }