src/file-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28
  29 #include "buffer.h"
  30 #include "elhash.h"
  31 #include "insdel.h"
  32 #include "lstream.h"
  33 #ifdef MULE
  34 #include "mule-ccl.h"
  35 #include "chartab.h"
  36 #endif
  37 #include "file-coding.h"
  38
  39 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
  40
  41 Lisp_Object Vkeyboard_coding_system;
  42 Lisp_Object Vterminal_coding_system;
  43 Lisp_Object Vcoding_system_for_read;
  44 Lisp_Object Vcoding_system_for_write;
  45 Lisp_Object Vfile_name_coding_system;
  46
  47 /* Table of symbols identifying each coding category. */
  48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  49
  50 /* Coding system currently associated with each coding category. */
  51 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  52
  53 /* Table of all coding categories in decreasing order of priority.
  54    This describes a permutation of the possible coding categories. */
  55 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  56
  57 Lisp_Object Qcoding_system_p;
  58
  59 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  60 /* Qinternal in general.c */
  61
  62 Lisp_Object Qmnemonic, Qeol_type;
  63 Lisp_Object Qcr, Qcrlf, Qlf;
  64 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  65 Lisp_Object Qpost_read_conversion;
  66 Lisp_Object Qpre_write_conversion;
  67
  68 #ifdef MULE
  69 Lisp_Object Qucs4, Qutf8;
  70 Lisp_Object Qbig5, Qshift_jis;
  71 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  72 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  73 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  74 Lisp_Object Qno_iso6429;
  75 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
  76 Lisp_Object Qctext, Qescape_quoted;
  77 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
  78 #endif
  79 Lisp_Object Qencode, Qdecode;
  80
  81 Lisp_Object Vcoding_system_hash_table;
  82
  83 int enable_multibyte_characters;
  84
  85 #ifdef MULE
  86 /* Additional information used by the ISO2022 decoder and detector. */
  87 struct iso2022_decoder
  88 {
  89   /* CHARSET holds the character sets currently assigned to the G0
  90      through G3 variables.  It is initialized from the array
  91      INITIAL_CHARSET in CODESYS. */
  92   Lisp_Object charset[4];
  93
  94   /* Which registers are currently invoked into the left (GL) and
  95      right (GR) halves of the 8-bit encoding space? */
  96   int register_left, register_right;
  97
  98   /* ISO_ESC holds a value indicating part of an escape sequence
  99      that has already been seen. */
 100   enum iso_esc_flag esc;
 101
 102   /* This records the bytes we've seen so far in an escape sequence,
 103      in case the sequence is invalid (we spit out the bytes unchanged). */
 104   unsigned char esc_bytes[8];
 105
 106   /* Index for next byte to store in ISO escape sequence. */
 107   int esc_bytes_index;
 108
 109 #ifdef ENABLE_COMPOSITE_CHARS
 110   /* Stuff seen so far when composing a string. */
 111   unsigned_char_dynarr *composite_chars;
 112 #endif
 113
 114   /* If we saw an invalid designation sequence for a particular
 115      register, we flag it here and switch to ASCII.  The next time we
 116      see a valid designation for this register, we turn off the flag
 117      and do the designation normally, but pretend the sequence was
 118      invalid.  The effect of all this is that (most of the time) the
 119      escape sequences for both the switch to the unknown charset, and
 120      the switch back to the known charset, get inserted literally into
 121      the buffer and saved out as such.  The hope is that we can
 122      preserve the escape sequences so that the resulting written out
 123      file makes sense.  If we don't do any of this, the designation
 124      to the invalid charset will be preserved but that switch back
 125      to the known charset will probably get eaten because it was
 126      the same charset that was already present in the register. */
 127   unsigned char invalid_designated[4];
 128
 129   /* We try to do similar things as above for direction-switching
 130      sequences.  If we encountered a direction switch while an
 131      invalid designation was present, or an invalid designation
 132      just after a direction switch (i.e. no valid designation
 133      encountered yet), we insert the direction-switch escape
 134      sequence literally into the output stream, and later on
 135      insert the corresponding direction-restoring escape sequence
 136      literally also. */
 137   unsigned int switched_dir_and_no_valid_charset_yet :1;
 138   unsigned int invalid_switch_dir :1;
 139
 140   /* Tells the decoder to output the escape sequence literally
 141      even though it was valid.  Used in the games we play to
 142      avoid lossage when we encounter invalid designations. */
 143   unsigned int output_literally :1;
 144   /* We encountered a direction switch followed by an invalid
 145      designation.  We didn't output the direction switch
 146      literally because we didn't know about the invalid designation;
 147      but we have to do so now. */
 148   unsigned int output_direction_sequence :1;
 149 };
 150 #endif /* MULE */
 151 EXFUN (Fcopy_coding_system, 2);
 152 #ifdef MULE
 153 struct detection_state;
 154 static int detect_coding_sjis (struct detection_state *st,
 155                                CONST unsigned char *src,
 156                                unsigned int n);
 157 static void decode_coding_sjis (Lstream *decoding,
 158                                 CONST unsigned char *src,
 159                                 unsigned_char_dynarr *dst,
 160                                 unsigned int n);
 161 static void encode_coding_sjis (Lstream *encoding,
 162                                 CONST unsigned char *src,
 163                                 unsigned_char_dynarr *dst,
 164                                 unsigned int n);
 165 static int detect_coding_big5 (struct detection_state *st,
 166                                CONST unsigned char *src,
 167                                unsigned int n);
 168 static void decode_coding_big5 (Lstream *decoding,
 169                                 CONST unsigned char *src,
 170                                 unsigned_char_dynarr *dst, unsigned int n);
 171 static void encode_coding_big5 (Lstream *encoding,
 172                                 CONST unsigned char *src,
 173                                 unsigned_char_dynarr *dst, unsigned int n);
 174 static int detect_coding_ucs4 (struct detection_state *st,
 175                                CONST unsigned char *src,
 176                                unsigned int n);
 177 static void decode_coding_ucs4 (Lstream *decoding,
 178                                 CONST unsigned char *src,
 179                                 unsigned_char_dynarr *dst, unsigned int n);
 180 static void encode_coding_ucs4 (Lstream *encoding,
 181                                 CONST unsigned char *src,
 182                                 unsigned_char_dynarr *dst, unsigned int n);
 183 static int detect_coding_utf8 (struct detection_state *st,
 184                                CONST unsigned char *src,
 185                                unsigned int n);
 186 static void decode_coding_utf8 (Lstream *decoding,
 187                                 CONST unsigned char *src,
 188                                 unsigned_char_dynarr *dst, unsigned int n);
 189 static void encode_coding_utf8 (Lstream *encoding,
 190                                 CONST unsigned char *src,
 191                                 unsigned_char_dynarr *dst, unsigned int n);
 192 static int postprocess_iso2022_mask (int mask);
 193 static void reset_iso2022 (Lisp_Object coding_system,
 194                            struct iso2022_decoder *iso);
 195 static int detect_coding_iso2022 (struct detection_state *st,
 196                                   CONST unsigned char *src,
 197                                   unsigned int n);
 198 static void decode_coding_iso2022 (Lstream *decoding,
 199                                    CONST unsigned char *src,
 200                                    unsigned_char_dynarr *dst, unsigned int n);
 201 static void encode_coding_iso2022 (Lstream *encoding,
 202                                    CONST unsigned char *src,
 203                                    unsigned_char_dynarr *dst, unsigned int n);
 204 #endif /* MULE */
 205 static void decode_coding_no_conversion (Lstream *decoding,
 206                                          CONST unsigned char *src,
 207                                          unsigned_char_dynarr *dst,
 208                                          unsigned int n);
 209 static void encode_coding_no_conversion (Lstream *encoding,
 210                                          CONST unsigned char *src,
 211                                          unsigned_char_dynarr *dst,
 212                                          unsigned int n);
 213 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
 214                          unsigned_char_dynarr *dst, unsigned int n);
 215 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
 216                          unsigned_char_dynarr *dst, unsigned int n);
 217
 218 typedef struct codesys_prop codesys_prop;
 219 struct codesys_prop
 220 {
 221   Lisp_Object sym;
 222   int prop_type;
 223 };
 224
 225 typedef struct
 226 {
 227   Dynarr_declare (codesys_prop);
 228 } codesys_prop_dynarr;
 229
 230 codesys_prop_dynarr *the_codesys_prop_dynarr;
 231
 232 enum codesys_prop_enum
 233 {
 234   CODESYS_PROP_ALL_OK,
 235   CODESYS_PROP_ISO2022,
 236   CODESYS_PROP_CCL
 237 };
 238
 239 \f
 240 /************************************************************************/
 241 /*                       Coding system functions                        */
 242 /************************************************************************/
 243
 244 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
 245 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 246 static void finalize_coding_system (void *header, int for_disksave);
 247
 248 #ifdef MULE
 249 static const struct lrecord_description ccs_description_1[] = {
 250   { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
 251   { XD_END }
 252 };
 253
 254 static const struct struct_description ccs_description = {
 255   sizeof(charset_conversion_spec),
 256   ccs_description_1
 257 };
 258
 259 static const struct lrecord_description ccsd_description_1[] = {
 260   XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
 261   { XD_END }
 262 };
 263
 264 static const struct struct_description ccsd_description = {
 265   sizeof(charset_conversion_spec_dynarr),
 266   ccsd_description_1
 267 };
 268 #endif
 269
 270 static const struct lrecord_description coding_system_description[] = {
 271   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
 272   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
 273   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
 274 #ifdef MULE
 275   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
 276   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 277   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 278   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
 279 #endif
 280   { XD_END }
 281 };
 282
 283 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 284                                mark_coding_system, print_coding_system,
 285                                finalize_coding_system,
 286                                0, 0, coding_system_description,
 287                                struct Lisp_Coding_System);
 288
 289 static Lisp_Object
 290 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
 291 {
 292   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 293
 294   markobj (CODING_SYSTEM_NAME (codesys));
 295   markobj (CODING_SYSTEM_DOC_STRING (codesys));
 296   markobj (CODING_SYSTEM_MNEMONIC (codesys));
 297   markobj (CODING_SYSTEM_EOL_LF (codesys));
 298   markobj (CODING_SYSTEM_EOL_CRLF (codesys));
 299   markobj (CODING_SYSTEM_EOL_CR (codesys));
 300
 301   switch (CODING_SYSTEM_TYPE (codesys))
 302     {
 303 #ifdef MULE
 304       int i;
 305     case CODESYS_ISO2022:
 306       for (i = 0; i < 4; i++)
 307         markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 308       if (codesys->iso2022.input_conv)
 309         {
 310           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 311             {
 312               struct charset_conversion_spec *ccs =
 313                 Dynarr_atp (codesys->iso2022.input_conv, i);
 314               markobj (ccs->from_charset);
 315               markobj (ccs->to_charset);
 316             }
 317         }
 318       if (codesys->iso2022.output_conv)
 319         {
 320           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 321             {
 322               struct charset_conversion_spec *ccs =
 323                 Dynarr_atp (codesys->iso2022.output_conv, i);
 324               markobj (ccs->from_charset);
 325               markobj (ccs->to_charset);
 326             }
 327         }
 328       break;
 329
 330     case CODESYS_CCL:
 331       markobj (CODING_SYSTEM_CCL_DECODE (codesys));
 332       markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
 333       break;
 334 #endif /* MULE */
 335     default:
 336       break;
 337     }
 338
 339   markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 340   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 341 }
 342
 343 static void
 344 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 345                      int escapeflag)
 346 {
 347   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 348   if (print_readably)
 349     error ("printing unreadable object #<coding_system 0x%x>",
 350            c->header.uid);
 351
 352   write_c_string ("#<coding_system ", printcharfun);
 353   print_internal (c->name, printcharfun, 1);
 354   write_c_string (">", printcharfun);
 355 }
 356
 357 static void
 358 finalize_coding_system (void *header, int for_disksave)
 359 {
 360   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 361   /* Since coding systems never go away, this function is not
 362      necessary.  But it would be necessary if we changed things
 363      so that coding systems could go away. */
 364   if (!for_disksave) /* see comment in lstream.c */
 365     {
 366       switch (CODING_SYSTEM_TYPE (c))
 367         {
 368 #ifdef MULE
 369         case CODESYS_ISO2022:
 370           if (c->iso2022.input_conv)
 371             {
 372               Dynarr_free (c->iso2022.input_conv);
 373               c->iso2022.input_conv = 0;
 374             }
 375           if (c->iso2022.output_conv)
 376             {
 377               Dynarr_free (c->iso2022.output_conv);
 378               c->iso2022.output_conv = 0;
 379             }
 380           break;
 381 #endif /* MULE */
 382         default:
 383           break;
 384         }
 385     }
 386 }
 387
 388 static enum eol_type
 389 symbol_to_eol_type (Lisp_Object symbol)
 390 {
 391   CHECK_SYMBOL (symbol);
 392   if (NILP (symbol))      return EOL_AUTODETECT;
 393   if (EQ (symbol, Qlf))   return EOL_LF;
 394   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 395   if (EQ (symbol, Qcr))   return EOL_CR;
 396
 397   signal_simple_error ("Unrecognized eol type", symbol);
 398   return EOL_AUTODETECT; /* not reached */
 399 }
 400
 401 static Lisp_Object
 402 eol_type_to_symbol (enum eol_type type)
 403 {
 404   switch (type)
 405     {
 406     default: abort ();
 407     case EOL_LF:         return Qlf;
 408     case EOL_CRLF:       return Qcrlf;
 409     case EOL_CR:         return Qcr;
 410     case EOL_AUTODETECT: return Qnil;
 411     }
 412 }
 413
 414 static void
 415 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 416 {
 417   Lisp_Object codesys_obj;
 418   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 419   char *codesys_name = (char *) alloca (len + 7);
 420   int mlen = -1;
 421   char *codesys_mnemonic=0;
 422
 423   Lisp_Object codesys_name_sym, sub_codesys_obj;
 424
 425   /* kludge */
 426
 427   XSETCODING_SYSTEM (codesys_obj, codesys);
 428
 429   memcpy (codesys_name,
 430           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 431
 432   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 433     {
 434       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 435       codesys_mnemonic = (char *) alloca (mlen + 7);
 436       memcpy (codesys_mnemonic,
 437               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 438     }
 439
 440 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 441   strcpy (codesys_name + len, "-" op_sys);                                      \
 442   if (mlen != -1)                                                               \
 443     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 444   codesys_name_sym = intern (codesys_name);                                     \
 445   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 446   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 447   if (mlen != -1)                                                               \
 448     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 449       build_string (codesys_mnemonic);                                          \
 450   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 451 } while (0)
 452
 453   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 454   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 455   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 456 }
 457
 458 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 459 Return t if OBJECT is a coding system.
 460 A coding system is an object that defines how text containing multiple
 461 character sets is encoded into a stream of (typically 8-bit) bytes.
 462 The coding system is used to decode the stream into a series of
 463 characters (which may be from multiple charsets) when the text is read
 464 from a file or process, and is used to encode the text back into the
 465 same format when it is written out to a file or process.
 466
 467 For example, many ISO2022-compliant coding systems (such as Compound
 468 Text, which is used for inter-client data under the X Window System)
 469 use escape sequences to switch between different charsets -- Japanese
 470 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 471 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 472 `make-coding-system' for more information.
 473
 474 Coding systems are normally identified using a symbol, and the
 475 symbol is accepted in place of the actual coding system object whenever
 476 a coding system is called for. (This is similar to how faces work.)
 477 */
 478        (object))
 479 {
 480   return CODING_SYSTEMP (object) ? Qt : Qnil;
 481 }
 482
 483 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 484 Retrieve the coding system of the given name.
 485
 486 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 487 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 488 If there is no such coding system, nil is returned.  Otherwise the
 489 associated coding system object is returned.
 490 */
 491        (coding_system_or_name))
 492 {
 493   if (CODING_SYSTEMP (coding_system_or_name))
 494     return coding_system_or_name;
 495
 496   if (NILP (coding_system_or_name))
 497     coding_system_or_name = Qbinary;
 498   else
 499     CHECK_SYMBOL (coding_system_or_name);
 500
 501   return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 502 }
 503
 504 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 505 Retrieve the coding system of the given name.
 506 Same as `find-coding-system' except that if there is no such
 507 coding system, an error is signaled instead of returning nil.
 508 */
 509        (name))
 510 {
 511   Lisp_Object coding_system = Ffind_coding_system (name);
 512
 513   if (NILP (coding_system))
 514     signal_simple_error ("No such coding system", name);
 515   return coding_system;
 516 }
 517
 518 /* We store the coding systems in hash tables with the names as the key and the
 519    actual coding system object as the value.  Occasionally we need to use them
 520    in a list format.  These routines provide us with that. */
 521 struct coding_system_list_closure
 522 {
 523   Lisp_Object *coding_system_list;
 524 };
 525
 526 static int
 527 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 528                                   void *coding_system_list_closure)
 529 {
 530   /* This function can GC */
 531   struct coding_system_list_closure *cscl =
 532     (struct coding_system_list_closure *) coding_system_list_closure;
 533   Lisp_Object *coding_system_list = cscl->coding_system_list;
 534
 535   *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
 536                                *coding_system_list);
 537   return 0;
 538 }
 539
 540 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 541 Return a list of the names of all defined coding systems.
 542 */
 543        ())
 544 {
 545   Lisp_Object coding_system_list = Qnil;
 546   struct gcpro gcpro1;
 547   struct coding_system_list_closure coding_system_list_closure;
 548
 549   GCPRO1 (coding_system_list);
 550   coding_system_list_closure.coding_system_list = &coding_system_list;
 551   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 552                  &coding_system_list_closure);
 553   UNGCPRO;
 554
 555   return coding_system_list;
 556 }
 557
 558 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 559 Return the name of the given coding system.
 560 */
 561        (coding_system))
 562 {
 563   coding_system = Fget_coding_system (coding_system);
 564   return XCODING_SYSTEM_NAME (coding_system);
 565 }
 566
 567 static Lisp_Coding_System *
 568 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 569 {
 570   Lisp_Coding_System *codesys =
 571     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 572
 573   zero_lcrecord (codesys);
 574   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 575   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 576   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 577   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 578   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 579   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 580   CODING_SYSTEM_TYPE     (codesys) = type;
 581   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 582 #ifdef MULE
 583   if (type == CODESYS_ISO2022)
 584     {
 585       int i;
 586       for (i = 0; i < 4; i++)
 587         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 588     }
 589   else if (type == CODESYS_CCL)
 590     {
 591       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 592       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 593     }
 594 #endif /* MULE */
 595   CODING_SYSTEM_NAME (codesys) = name;
 596
 597   return codesys;
 598 }
 599
 600 #ifdef MULE
 601 /* Given a list of charset conversion specs as specified in a Lisp
 602    program, parse it into STORE_HERE. */
 603
 604 static void
 605 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 606                                 Lisp_Object spec_list)
 607 {
 608   Lisp_Object rest;
 609
 610   EXTERNAL_LIST_LOOP (rest, spec_list)
 611     {
 612       Lisp_Object car = XCAR (rest);
 613       Lisp_Object from, to;
 614       struct charset_conversion_spec spec;
 615
 616       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 617         signal_simple_error ("Invalid charset conversion spec", car);
 618       from = Fget_charset (XCAR (car));
 619       to = Fget_charset (XCAR (XCDR (car)));
 620       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 621         signal_simple_error_2
 622           ("Attempted conversion between different charset types",
 623            from, to);
 624       spec.from_charset = from;
 625       spec.to_charset = to;
 626
 627       Dynarr_add (store_here, spec);
 628     }
 629 }
 630
 631 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 632    specs, return the equivalent as the Lisp programmer would see it.
 633
 634    If LOAD_HERE is 0, return Qnil. */
 635
 636 static Lisp_Object
 637 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 638 {
 639   int i;
 640   Lisp_Object result;
 641
 642   if (!load_here)
 643     return Qnil;
 644   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 645     {
 646       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 647       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 648     }
 649
 650   return Fnreverse (result);
 651 }
 652
 653 #endif /* MULE */
 654
 655 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 656 Register symbol NAME as a coding system.
 657
 658 TYPE describes the conversion method used and should be one of
 659
 660 nil or 'undecided
 661      Automatic conversion.  XEmacs attempts to detect the coding system
 662      used in the file.
 663 'no-conversion
 664      No conversion.  Use this for binary files and such.  On output,
 665      graphic characters that are not in ASCII or Latin-1 will be
 666      replaced by a ?. (For a no-conversion-encoded buffer, these
 667      characters will only be present if you explicitly insert them.)
 668 'shift-jis
 669      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 670 'ucs-4
 671      ISO 10646 UCS-4 encoding.
 672 'utf-8
 673      ISO 10646 UTF-8 encoding.
 674 'iso2022
 675      Any ISO2022-compliant encoding.  Among other things, this includes
 676      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 677      standard Unix encoding for Japanese and other languages), and
 678      Compound Text (the encoding used in X11).  You can specify more
 679      specific information about the conversion with the FLAGS argument.
 680 'big5
 681      Big5 (the encoding commonly used for Taiwanese).
 682 'ccl
 683      The conversion is performed using a user-written pseudo-code
 684      program.  CCL (Code Conversion Language) is the name of this
 685      pseudo-code.
 686 'internal
 687      Write out or read in the raw contents of the memory representing
 688      the buffer's text.  This is primarily useful for debugging
 689      purposes, and is only enabled when XEmacs has been compiled with
 690      DEBUG_XEMACS defined (via the --debug configure option).
 691      WARNING: Reading in a file using 'internal conversion can result
 692      in an internal inconsistency in the memory representing a
 693      buffer's text, which will produce unpredictable results and may
 694      cause XEmacs to crash.  Under normal circumstances you should
 695      never use 'internal conversion.
 696
 697 DOC-STRING is a string describing the coding system.
 698
 699 PROPS is a property list, describing the specific nature of the
 700 character set.  Recognized properties are:
 701
 702 'mnemonic
 703      String to be displayed in the modeline when this coding system is
 704      active.
 705
 706 'eol-type
 707      End-of-line conversion to be used.  It should be one of
 708
 709         nil
 710                 Automatically detect the end-of-line type (LF, CRLF,
 711                 or CR).  Also generate subsidiary coding systems named
 712                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 713                 identical to this coding system but have an EOL-TYPE
 714                 value of 'lf, 'crlf, and 'cr, respectively.
 715         'lf
 716                 The end of a line is marked externally using ASCII LF.
 717                 Since this is also the way that XEmacs represents an
 718                 end-of-line internally, specifying this option results
 719                 in no end-of-line conversion.  This is the standard
 720                 format for Unix text files.
 721         'crlf
 722                 The end of a line is marked externally using ASCII
 723                 CRLF.  This is the standard format for MS-DOS text
 724                 files.
 725         'cr
 726                 The end of a line is marked externally using ASCII CR.
 727                 This is the standard format for Macintosh text files.
 728         t
 729                 Automatically detect the end-of-line type but do not
 730                 generate subsidiary coding systems.  (This value is
 731                 converted to nil when stored internally, and
 732                 `coding-system-property' will return nil.)
 733
 734 'post-read-conversion
 735      Function called after a file has been read in, to perform the
 736      decoding.  Called with two arguments, BEG and END, denoting
 737      a region of the current buffer to be decoded.
 738
 739 'pre-write-conversion
 740      Function called before a file is written out, to perform the
 741      encoding.  Called with two arguments, BEG and END, denoting
 742      a region of the current buffer to be encoded.
 743
 744
 745 The following additional properties are recognized if TYPE is 'iso2022:
 746
 747 'charset-g0
 748 'charset-g1
 749 'charset-g2
 750 'charset-g3
 751      The character set initially designated to the G0 - G3 registers.
 752      The value should be one of
 753
 754           -- A charset object (designate that character set)
 755           -- nil (do not ever use this register)
 756           -- t (no character set is initially designated to
 757                 the register, but may be later on; this automatically
 758                 sets the corresponding `force-g*-on-output' property)
 759
 760 'force-g0-on-output
 761 'force-g1-on-output
 762 'force-g2-on-output
 763 'force-g2-on-output
 764      If non-nil, send an explicit designation sequence on output before
 765      using the specified register.
 766
 767 'short
 768      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 769      "ESC $ B" on output in place of the full designation sequences
 770      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 771
 772 'no-ascii-eol
 773      If non-nil, don't designate ASCII to G0 at each end of line on output.
 774      Setting this to non-nil also suppresses other state-resetting that
 775      normally happens at the end of a line.
 776
 777 'no-ascii-cntl
 778      If non-nil, don't designate ASCII to G0 before control chars on output.
 779
 780 'seven
 781      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 782      environment.
 783
 784 'lock-shift
 785      If non-nil, use locking-shift (SO/SI) instead of single-shift
 786      or designation by escape sequence.
 787
 788 'no-iso6429
 789      If non-nil, don't use ISO6429's direction specification.
 790
 791 'escape-quoted
 792      If non-nil, literal control characters that are the same as
 793      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 794      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 795      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 796      so that they can be properly distinguished from an escape sequence.
 797      (Note that doing this results in a non-portable encoding.) This
 798      encoding flag is used for byte-compiled files.  Note that ESC
 799      is a good choice for a quoting character because there are no
 800      escape sequences whose second byte is a character from the Control-0
 801      or Control-1 character sets; this is explicitly disallowed by the
 802      ISO2022 standard.
 803
 804 'input-charset-conversion
 805      A list of conversion specifications, specifying conversion of
 806      characters in one charset to another when decoding is performed.
 807      Each specification is a list of two elements: the source charset,
 808      and the destination charset.
 809
 810 'output-charset-conversion
 811      A list of conversion specifications, specifying conversion of
 812      characters in one charset to another when encoding is performed.
 813      The form of each specification is the same as for
 814      'input-charset-conversion.
 815
 816
 817 The following additional properties are recognized (and required)
 818 if TYPE is 'ccl:
 819
 820 'decode
 821      CCL program used for decoding (converting to internal format).
 822
 823 'encode
 824      CCL program used for encoding (converting to external format).
 825 */
 826        (name, type, doc_string, props))
 827 {
 828   Lisp_Coding_System *codesys;
 829   Lisp_Object rest, key, value;
 830   enum coding_system_type ty;
 831   int need_to_setup_eol_systems = 1;
 832
 833   /* Convert type to constant */
 834   if (NILP (type) || EQ (type, Qundecided))
 835                                       { ty = CODESYS_AUTODETECT; }
 836 #ifdef MULE
 837   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 838   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 839   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 840   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 841   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 842   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 843 #endif
 844   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 845 #ifdef DEBUG_XEMACS
 846   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 847 #endif
 848   else
 849     signal_simple_error ("Invalid coding system type", type);
 850
 851   CHECK_SYMBOL (name);
 852
 853   codesys = allocate_coding_system (ty, name);
 854
 855   if (NILP (doc_string))
 856     doc_string = build_string ("");
 857   else
 858     CHECK_STRING (doc_string);
 859   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 860
 861   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 862     {
 863       if (EQ (key, Qmnemonic))
 864         {
 865           if (!NILP (value))
 866             CHECK_STRING (value);
 867           CODING_SYSTEM_MNEMONIC (codesys) = value;
 868         }
 869
 870       else if (EQ (key, Qeol_type))
 871         {
 872           need_to_setup_eol_systems = NILP (value);
 873           if (EQ (value, Qt))
 874             value = Qnil;
 875           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 876         }
 877
 878       else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 879       else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 880 #ifdef MULE
 881       else if (ty == CODESYS_ISO2022)
 882         {
 883 #define FROB_INITIAL_CHARSET(charset_num) \
 884   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 885     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 886
 887           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 888           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 889           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 890           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 891
 892 #define FROB_FORCE_CHARSET(charset_num) \
 893   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 894
 895           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 896           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 897           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 898           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 899
 900 #define FROB_BOOLEAN_PROPERTY(prop) \
 901   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 902
 903           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 904           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 905           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 906           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 907           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 908           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 909           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 910
 911           else if (EQ (key, Qinput_charset_conversion))
 912             {
 913               codesys->iso2022.input_conv =
 914                 Dynarr_new (charset_conversion_spec);
 915               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 916                                               value);
 917             }
 918           else if (EQ (key, Qoutput_charset_conversion))
 919             {
 920               codesys->iso2022.output_conv =
 921                 Dynarr_new (charset_conversion_spec);
 922               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 923                                               value);
 924             }
 925           else
 926             signal_simple_error ("Unrecognized property", key);
 927         }
 928       else if (EQ (type, Qccl))
 929         {
 930           if (EQ (key, Qdecode))
 931             {
 932               CHECK_VECTOR (value);
 933               CODING_SYSTEM_CCL_DECODE (codesys) = value;
 934             }
 935           else if (EQ (key, Qencode))
 936             {
 937               CHECK_VECTOR (value);
 938               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
 939             }
 940           else
 941             signal_simple_error ("Unrecognized property", key);
 942         }
 943 #endif /* MULE */
 944       else
 945         signal_simple_error ("Unrecognized property", key);
 946     }
 947
 948   if (need_to_setup_eol_systems)
 949     setup_eol_coding_systems (codesys);
 950
 951   {
 952     Lisp_Object codesys_obj;
 953     XSETCODING_SYSTEM (codesys_obj, codesys);
 954     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
 955     return codesys_obj;
 956   }
 957 }
 958
 959 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
 960 Copy OLD-CODING-SYSTEM to NEW-NAME.
 961 If NEW-NAME does not name an existing coding system, a new one will
 962 be created.
 963 */
 964        (old_coding_system, new_name))
 965 {
 966   Lisp_Object new_coding_system;
 967   old_coding_system = Fget_coding_system (old_coding_system);
 968   new_coding_system = Ffind_coding_system (new_name);
 969   if (NILP (new_coding_system))
 970     {
 971       XSETCODING_SYSTEM (new_coding_system,
 972                          allocate_coding_system
 973                          (XCODING_SYSTEM_TYPE (old_coding_system),
 974                           new_name));
 975       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
 976     }
 977
 978   {
 979     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
 980     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
 981     memcpy (((char *) to  ) + sizeof (to->header),
 982             ((char *) from) + sizeof (from->header),
 983             sizeof (*from) - sizeof (from->header));
 984     to->name = new_name;
 985   }
 986   return new_coding_system;
 987 }
 988
 989 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
 990 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
 991 */
 992        (alias, coding_system))
 993 {
 994   CHECK_SYMBOL (alias);
 995   if (!NILP (Ffind_coding_system (alias)))
 996     signal_simple_error ("Symbol already names a coding system", alias);
 997   coding_system = Fget_coding_system (coding_system);
 998   Fputhash (alias, coding_system, Vcoding_system_hash_table);
 999
1000   /* Set up aliases for subsidiaries. */
1001   if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1002     {
1003       Lisp_Object str;
1004       XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1005 #define FROB(type, name)                                                        \
1006       do {                                                                      \
1007         Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system);     \
1008         if (!NILP (subsidiary))                                                 \
1009           Fdefine_coding_system_alias                                           \
1010             (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary);   \
1011       } while (0)
1012       FROB (LF,   "-unix");
1013       FROB (CRLF, "-dos");
1014       FROB (CR,   "-mac");
1015 #undef FROB
1016     }
1017   /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
1018      but it doesn't look intentional, so I'd rather return something
1019      meaningful or nothing at all. */
1020   return Qnil;
1021 }
1022
1023 static Lisp_Object
1024 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1025 {
1026   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1027   Lisp_Object new_coding_system;
1028
1029   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1030     return coding_system;
1031
1032   switch (type)
1033     {
1034     case EOL_AUTODETECT: return coding_system;
1035     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1036     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1037     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1038     default:       abort ();
1039     }
1040
1041   return NILP (new_coding_system) ? coding_system : new_coding_system;
1042 }
1043
1044 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1045 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1046 */
1047        (coding_system, eol_type))
1048 {
1049   coding_system = Fget_coding_system (coding_system);
1050
1051   return subsidiary_coding_system (coding_system,
1052                                    symbol_to_eol_type (eol_type));
1053 }
1054
1055 \f
1056 /************************************************************************/
1057 /*                         Coding system accessors                      */
1058 /************************************************************************/
1059
1060 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1061 Return the doc string for CODING-SYSTEM.
1062 */
1063        (coding_system))
1064 {
1065   coding_system = Fget_coding_system (coding_system);
1066   return XCODING_SYSTEM_DOC_STRING (coding_system);
1067 }
1068
1069 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1070 Return the type of CODING-SYSTEM.
1071 */
1072        (coding_system))
1073 {
1074   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1075     {
1076     default: abort ();
1077     case CODESYS_AUTODETECT:    return Qundecided;
1078 #ifdef MULE
1079     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1080     case CODESYS_ISO2022:       return Qiso2022;
1081     case CODESYS_BIG5:          return Qbig5;
1082     case CODESYS_UCS4:          return Qucs4;
1083     case CODESYS_UTF8:          return Qutf8;
1084     case CODESYS_CCL:           return Qccl;
1085 #endif
1086     case CODESYS_NO_CONVERSION: return Qno_conversion;
1087 #ifdef DEBUG_XEMACS
1088     case CODESYS_INTERNAL:      return Qinternal;
1089 #endif
1090     }
1091 }
1092
1093 #ifdef MULE
1094 static
1095 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1096 {
1097   Lisp_Object cs
1098     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1099
1100   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1101 }
1102
1103 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1104 Return initial charset of CODING-SYSTEM designated to GNUM.
1105 GNUM allows 0 .. 3.
1106 */
1107        (coding_system, gnum))
1108 {
1109   coding_system = Fget_coding_system (coding_system);
1110   CHECK_INT (gnum);
1111
1112   return coding_system_charset (coding_system, XINT (gnum));
1113 }
1114 #endif /* MULE */
1115
1116 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1117 Return the PROP property of CODING-SYSTEM.
1118 */
1119        (coding_system, prop))
1120 {
1121   int i, ok = 0;
1122   enum coding_system_type type;
1123
1124   coding_system = Fget_coding_system (coding_system);
1125   CHECK_SYMBOL (prop);
1126   type = XCODING_SYSTEM_TYPE (coding_system);
1127
1128   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1129     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1130       {
1131         ok = 1;
1132         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1133           {
1134           case CODESYS_PROP_ALL_OK:
1135             break;
1136 #ifdef MULE
1137           case CODESYS_PROP_ISO2022:
1138             if (type != CODESYS_ISO2022)
1139               signal_simple_error
1140                 ("Property only valid in ISO2022 coding systems",
1141                  prop);
1142             break;
1143
1144           case CODESYS_PROP_CCL:
1145             if (type != CODESYS_CCL)
1146               signal_simple_error
1147                 ("Property only valid in CCL coding systems",
1148                  prop);
1149             break;
1150 #endif /* MULE */
1151           default:
1152             abort ();
1153           }
1154       }
1155
1156   if (!ok)
1157     signal_simple_error ("Unrecognized property", prop);
1158
1159   if (EQ (prop, Qname))
1160     return XCODING_SYSTEM_NAME (coding_system);
1161   else if (EQ (prop, Qtype))
1162     return Fcoding_system_type (coding_system);
1163   else if (EQ (prop, Qdoc_string))
1164     return XCODING_SYSTEM_DOC_STRING (coding_system);
1165   else if (EQ (prop, Qmnemonic))
1166     return XCODING_SYSTEM_MNEMONIC (coding_system);
1167   else if (EQ (prop, Qeol_type))
1168     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1169   else if (EQ (prop, Qeol_lf))
1170     return XCODING_SYSTEM_EOL_LF (coding_system);
1171   else if (EQ (prop, Qeol_crlf))
1172     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1173   else if (EQ (prop, Qeol_cr))
1174     return XCODING_SYSTEM_EOL_CR (coding_system);
1175   else if (EQ (prop, Qpost_read_conversion))
1176     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1177   else if (EQ (prop, Qpre_write_conversion))
1178     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1179 #ifdef MULE
1180   else if (type == CODESYS_ISO2022)
1181     {
1182       if (EQ (prop, Qcharset_g0))
1183         return coding_system_charset (coding_system, 0);
1184       else if (EQ (prop, Qcharset_g1))
1185         return coding_system_charset (coding_system, 1);
1186       else if (EQ (prop, Qcharset_g2))
1187         return coding_system_charset (coding_system, 2);
1188       else if (EQ (prop, Qcharset_g3))
1189         return coding_system_charset (coding_system, 3);
1190
1191 #define FORCE_CHARSET(charset_num) \
1192   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1193    (coding_system, charset_num) ? Qt : Qnil)
1194
1195       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1196       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1197       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1198       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1199
1200 #define LISP_BOOLEAN(prop) \
1201   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1202
1203       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1204       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1205       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1206       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1207       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1208       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1209       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1210
1211       else if (EQ (prop, Qinput_charset_conversion))
1212         return
1213           unparse_charset_conversion_specs
1214             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1215       else if (EQ (prop, Qoutput_charset_conversion))
1216         return
1217           unparse_charset_conversion_specs
1218             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1219       else
1220         abort ();
1221     }
1222   else if (type == CODESYS_CCL)
1223     {
1224       if (EQ (prop, Qdecode))
1225         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1226       else if (EQ (prop, Qencode))
1227         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1228       else
1229         abort ();
1230     }
1231 #endif /* MULE */
1232   else
1233     abort ();
1234
1235   return Qnil; /* not reached */
1236 }
1237
1238 \f
1239 /************************************************************************/
1240 /*                       Coding category functions                      */
1241 /************************************************************************/
1242
1243 static int
1244 decode_coding_category (Lisp_Object symbol)
1245 {
1246   int i;
1247
1248   CHECK_SYMBOL (symbol);
1249   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1250     if (EQ (coding_category_symbol[i], symbol))
1251       return i;
1252
1253   signal_simple_error ("Unrecognized coding category", symbol);
1254   return 0; /* not reached */
1255 }
1256
1257 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1258 Return a list of all recognized coding categories.
1259 */
1260        ())
1261 {
1262   int i;
1263   Lisp_Object list = Qnil;
1264
1265   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1266     list = Fcons (coding_category_symbol[i], list);
1267   return list;
1268 }
1269
1270 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1271 Change the priority order of the coding categories.
1272 LIST should be list of coding categories, in descending order of
1273 priority.  Unspecified coding categories will be lower in priority
1274 than all specified ones, in the same relative order they were in
1275 previously.
1276 */
1277        (list))
1278 {
1279   int category_to_priority[CODING_CATEGORY_LAST + 1];
1280   int i, j;
1281   Lisp_Object rest;
1282
1283   /* First generate a list that maps coding categories to priorities. */
1284
1285   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1286     category_to_priority[i] = -1;
1287
1288   /* Highest priority comes from the specified list. */
1289   i = 0;
1290   EXTERNAL_LIST_LOOP (rest, list)
1291     {
1292       int cat = decode_coding_category (XCAR (rest));
1293
1294       if (category_to_priority[cat] >= 0)
1295         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1296       category_to_priority[cat] = i++;
1297     }
1298
1299   /* Now go through the existing categories by priority to retrieve
1300      the categories not yet specified and preserve their priority
1301      order. */
1302   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1303     {
1304       int cat = coding_category_by_priority[j];
1305       if (category_to_priority[cat] < 0)
1306         category_to_priority[cat] = i++;
1307     }
1308
1309   /* Now we need to construct the inverse of the mapping we just
1310      constructed. */
1311
1312   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1313     coding_category_by_priority[category_to_priority[i]] = i;
1314
1315   /* Phew!  That was confusing. */
1316   return Qnil;
1317 }
1318
1319 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1320 Return a list of coding categories in descending order of priority.
1321 */
1322        ())
1323 {
1324   int i;
1325   Lisp_Object list = Qnil;
1326
1327   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1328     list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1329                   list);
1330   return list;
1331 }
1332
1333 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1334 Change the coding system associated with a coding category.
1335 */
1336        (coding_category, coding_system))
1337 {
1338   int cat = decode_coding_category (coding_category);
1339
1340   coding_system = Fget_coding_system (coding_system);
1341   coding_category_system[cat] = coding_system;
1342   return Qnil;
1343 }
1344
1345 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1346 Return the coding system associated with a coding category.
1347 */
1348        (coding_category))
1349 {
1350   int cat = decode_coding_category (coding_category);
1351   Lisp_Object sys = coding_category_system[cat];
1352
1353   if (!NILP (sys))
1354     return XCODING_SYSTEM_NAME (sys);
1355   return Qnil;
1356 }
1357
1358 \f
1359 /************************************************************************/
1360 /*                     Detecting the encoding of data                   */
1361 /************************************************************************/
1362
1363 struct detection_state
1364 {
1365   enum eol_type eol_type;
1366   int seen_non_ascii;
1367   int mask;
1368 #ifdef MULE
1369   struct
1370     {
1371       int mask;
1372       int in_second_byte;
1373     }
1374   big5;
1375
1376   struct
1377     {
1378       int mask;
1379       int in_second_byte;
1380     }
1381   shift_jis;
1382
1383   struct
1384     {
1385       int mask;
1386       int in_byte;
1387   }
1388   ucs4;
1389
1390   struct
1391     {
1392       int mask;
1393       int in_byte;
1394     }
1395   utf8;
1396
1397   struct
1398     {
1399       int mask;
1400       int initted;
1401       struct iso2022_decoder iso;
1402       unsigned int flags;
1403       int high_byte_count;
1404       unsigned int saw_single_shift:1;
1405     }
1406   iso2022;
1407 #endif
1408   struct
1409     {
1410       int seen_anything;
1411       int just_saw_cr;
1412     }
1413   eol;
1414 };
1415
1416 static int
1417 acceptable_control_char_p (int c)
1418 {
1419   switch (c)
1420     {
1421       /* Allow and ignore control characters that you might
1422          reasonably see in a text file */
1423     case '\r':
1424     case '\n':
1425     case '\t':
1426     case  7: /* bell */
1427     case  8: /* backspace */
1428     case 11: /* vertical tab */
1429     case 12: /* form feed */
1430     case 26: /* MS-DOS C-z junk */
1431     case 31: /* '^_' -- for info */
1432       return 1;
1433     default:
1434       return 0;
1435     }
1436 }
1437
1438 static int
1439 mask_has_at_most_one_bit_p (int mask)
1440 {
1441   /* Perhaps the only thing useful you learn from intensive Microsoft
1442      technical interviews */
1443   return (mask & (mask - 1)) == 0;
1444 }
1445
1446 static enum eol_type
1447 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1448                  unsigned int n)
1449 {
1450   int c;
1451
1452   while (n--)
1453     {
1454       c = *src++;
1455       if (c == '\r')
1456         st->eol.just_saw_cr = 1;
1457       else
1458         {
1459           if (c == '\n')
1460             {
1461               if (st->eol.just_saw_cr)
1462                 return EOL_CRLF;
1463               else if (st->eol.seen_anything)
1464                 return EOL_LF;
1465             }
1466           else if (st->eol.just_saw_cr)
1467             return EOL_CR;
1468           st->eol.just_saw_cr = 0;
1469         }
1470       st->eol.seen_anything = 1;
1471     }
1472
1473   return EOL_AUTODETECT;
1474 }
1475
1476 /* Attempt to determine the encoding and EOL type of the given text.
1477    Before calling this function for the first type, you must initialize
1478    st->eol_type as appropriate and initialize st->mask to ~0.
1479
1480    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1481    not yet known.
1482
1483    st->mask holds the determined coding category mask, or ~0 if only
1484    ASCII has been seen so far.
1485
1486    Returns:
1487
1488    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1489         is present in st->mask
1490    1 == definitive answers are here for both st->eol_type and st->mask
1491 */
1492
1493 static int
1494 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1495                     unsigned int n, int just_do_eol)
1496 {
1497   int c;
1498
1499   if (st->eol_type == EOL_AUTODETECT)
1500     st->eol_type = detect_eol_type (st, src, n);
1501
1502   if (just_do_eol)
1503     return st->eol_type != EOL_AUTODETECT;
1504
1505   if (!st->seen_non_ascii)
1506     {
1507       for (; n; n--, src++)
1508         {
1509           c = *src;
1510           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1511             {
1512               st->seen_non_ascii = 1;
1513 #ifdef MULE
1514               st->shift_jis.mask = ~0;
1515               st->big5.mask = ~0;
1516               st->ucs4.mask = ~0;
1517               st->utf8.mask = ~0;
1518               st->iso2022.mask = ~0;
1519 #endif
1520               break;
1521             }
1522         }
1523     }
1524
1525   if (!n)
1526     return 0;
1527 #ifdef MULE
1528   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1529     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1530   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1531     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1532   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1533     st->big5.mask = detect_coding_big5 (st, src, n);
1534   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1535     st->utf8.mask = detect_coding_utf8 (st, src, n);
1536   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1537     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1538
1539   st->mask
1540     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1541     | st->utf8.mask | st->ucs4.mask;
1542 #endif
1543   {
1544     int retval = mask_has_at_most_one_bit_p (st->mask);
1545     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1546     return retval && st->eol_type != EOL_AUTODETECT;
1547   }
1548 }
1549
1550 static Lisp_Object
1551 coding_system_from_mask (int mask)
1552 {
1553   if (mask == ~0)
1554     {
1555       /* If the file was entirely or basically ASCII, use the
1556          default value of `buffer-file-coding-system'. */
1557       Lisp_Object retval =
1558         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1559       if (!NILP (retval))
1560         {
1561           retval = Ffind_coding_system (retval);
1562           if (NILP (retval))
1563             {
1564               warn_when_safe
1565                 (Qbad_variable, Qwarning,
1566                  "Invalid `default-buffer-file-coding-system', set to nil");
1567               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1568             }
1569         }
1570       if (NILP (retval))
1571         retval = Fget_coding_system (Qraw_text);
1572       return retval;
1573     }
1574   else
1575     {
1576       int i;
1577       int cat = -1;
1578 #ifdef MULE
1579       mask = postprocess_iso2022_mask (mask);
1580 #endif
1581       /* Look through the coding categories by priority and find
1582          the first one that is allowed. */
1583       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1584         {
1585           cat = coding_category_by_priority[i];
1586           if ((mask & (1 << cat)) &&
1587               !NILP (coding_category_system[cat]))
1588             break;
1589         }
1590       if (cat >= 0)
1591         return coding_category_system[cat];
1592       else
1593         return Fget_coding_system (Qraw_text);
1594     }
1595 }
1596
1597 /* Given a seekable read stream and potential coding system and EOL type
1598    as specified, do any autodetection that is called for.  If the
1599    coding system and/or EOL type are not autodetect, they will be left
1600    alone; but this function will never return an autodetect coding system
1601    or EOL type.
1602
1603    This function does not automatically fetch subsidiary coding systems;
1604    that should be unnecessary with the explicit eol-type argument. */
1605
1606 void
1607 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1608                               enum eol_type *eol_type_in_out)
1609 {
1610   struct detection_state decst;
1611
1612   if (*eol_type_in_out == EOL_AUTODETECT)
1613     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1614
1615   xzero (decst);
1616   decst.eol_type = *eol_type_in_out;
1617   decst.mask = ~0;
1618
1619   /* If autodetection is called for, do it now. */
1620   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1621       *eol_type_in_out == EOL_AUTODETECT)
1622     {
1623       unsigned char random_buffer[4096];
1624       int nread;
1625       Lisp_Object coding_system = Qnil;
1626
1627       nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1628       if (nread)
1629         {
1630           unsigned char *cp = random_buffer;
1631
1632           while (cp < random_buffer + nread)
1633             {
1634               if ((*cp++ == 'c') && (cp < random_buffer + nread) &&
1635                   (*cp++ == 'o') && (cp < random_buffer + nread) &&
1636                   (*cp++ == 'd') && (cp < random_buffer + nread) &&
1637                   (*cp++ == 'i') && (cp < random_buffer + nread) &&
1638                   (*cp++ == 'n') && (cp < random_buffer + nread) &&
1639                   (*cp++ == 'g') && (cp < random_buffer + nread) &&
1640                   (*cp++ == ':') && (cp < random_buffer + nread))
1641                 {
1642                   unsigned char coding_system_name[4096 - 6];
1643                   unsigned char *np = coding_system_name;
1644
1645                   while ( (cp < random_buffer + nread)
1646                           && ((*cp == ' ') || (*cp == '\t')) )
1647                     {
1648                       cp++;
1649                     }
1650                   while ( (cp < random_buffer + nread) &&
1651                           (*cp != ' ') && (*cp != '\t') && (*cp != ';') )
1652                     {
1653                       *np++ = *cp++;
1654                     }
1655                   *np = 0;
1656                   coding_system
1657                     = Ffind_coding_system (intern (coding_system_name));
1658                   break;
1659                 }
1660             }
1661           if (EQ(coding_system, Qnil))
1662             do{
1663               if (detect_coding_type (&decst, random_buffer, nread,
1664                                       XCODING_SYSTEM_TYPE (*codesys_in_out)
1665                                       != CODESYS_AUTODETECT))
1666                 break;
1667               nread = Lstream_read (stream,
1668                                     random_buffer, sizeof (random_buffer));
1669               if (!nread)
1670                 break;
1671             } while(1);
1672         }
1673       *eol_type_in_out = decst.eol_type;
1674       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1675         {
1676           if (EQ(coding_system, Qnil))
1677             *codesys_in_out = coding_system_from_mask (decst.mask);
1678           else
1679             *codesys_in_out = coding_system;
1680         }
1681     }
1682   /* If we absolutely can't determine the EOL type, just assume LF. */
1683   if (*eol_type_in_out == EOL_AUTODETECT)
1684     *eol_type_in_out = EOL_LF;
1685
1686   Lstream_rewind (stream);
1687 }
1688
1689 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1690 Detect coding system of the text in the region between START and END.
1691 Returned a list of possible coding systems ordered by priority.
1692 If only ASCII characters are found, it returns 'undecided or one of
1693 its subsidiary coding systems according to a detected end-of-line
1694 type.  Optional arg BUFFER defaults to the current buffer.
1695 */
1696        (start, end, buffer))
1697 {
1698   Lisp_Object val = Qnil;
1699   struct buffer *buf = decode_buffer (buffer, 0);
1700   Bufpos b, e;
1701   Lisp_Object instream, lb_instream;
1702   Lstream *istr, *lb_istr;
1703   struct detection_state decst;
1704   struct gcpro gcpro1, gcpro2;
1705
1706   get_buffer_range_char (buf, start, end, &b, &e, 0);
1707   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1708   lb_istr = XLSTREAM (lb_instream);
1709   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1710   istr = XLSTREAM (instream);
1711   GCPRO2 (instream, lb_instream);
1712   xzero (decst);
1713   decst.eol_type = EOL_AUTODETECT;
1714   decst.mask = ~0;
1715   while (1)
1716     {
1717       unsigned char random_buffer[4096];
1718       int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1719
1720       if (!nread)
1721         break;
1722       if (detect_coding_type (&decst, random_buffer, nread, 0))
1723         break;
1724     }
1725
1726   if (decst.mask == ~0)
1727     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1728                                     decst.eol_type);
1729   else
1730     {
1731       int i;
1732
1733       val = Qnil;
1734 #ifdef MULE
1735       decst.mask = postprocess_iso2022_mask (decst.mask);
1736 #endif
1737       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1738         {
1739           int sys = coding_category_by_priority[i];
1740           if (decst.mask & (1 << sys))
1741             {
1742               Lisp_Object codesys = coding_category_system[sys];
1743               if (!NILP (codesys))
1744                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1745               val = Fcons (codesys, val);
1746             }
1747         }
1748     }
1749   Lstream_close (istr);
1750   UNGCPRO;
1751   Lstream_delete (istr);
1752   Lstream_delete (lb_istr);
1753   return val;
1754 }
1755
1756 \f
1757 /************************************************************************/
1758 /*           Converting to internal Mule format ("decoding")            */
1759 /************************************************************************/
1760
1761 /* A decoding stream is a stream used for decoding text (i.e.
1762    converting from some external format to internal format).
1763    The decoding-stream object keeps track of the actual coding
1764    stream, the stream that is at the other end, and data that
1765    needs to be persistent across the lifetime of the stream. */
1766
1767 /* Handle the EOL stuff related to just-read-in character C.
1768    EOL_TYPE is the EOL type of the coding stream.
1769    FLAGS is the current value of FLAGS in the coding stream, and may
1770    be modified by this macro.  (The macro only looks at the
1771    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1772    bytes are to be written.  You need to also define a local goto
1773    label "label_continue_loop" that is at the end of the main
1774    character-reading loop.
1775
1776    If C is a CR character, then this macro handles it entirely and
1777    jumps to label_continue_loop.  Otherwise, this macro does not add
1778    anything to DST, and continues normally.  You should continue
1779    processing C normally after this macro. */
1780
1781 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
1782 do {                                                            \
1783   if (c == '\r')                                                \
1784     {                                                           \
1785       if (eol_type == EOL_CR)                                   \
1786         Dynarr_add (dst, '\n');                                 \
1787       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1788         Dynarr_add (dst, c);                                    \
1789       else                                                      \
1790         flags |= CODING_STATE_CR;                               \
1791       goto label_continue_loop;                                 \
1792     }                                                           \
1793   else if (flags & CODING_STATE_CR)                             \
1794     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
1795       if (c != '\n')                                            \
1796         Dynarr_add (dst, '\r');                                 \
1797       flags &= ~CODING_STATE_CR;                                \
1798     }                                                           \
1799 } while (0)
1800
1801 /* C should be a binary character in the range 0 - 255; convert
1802    to internal format and add to Dynarr DST. */
1803
1804 #ifdef UTF2000
1805 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1806 do {                                            \
1807   if (BYTE_ASCII_P (c))                         \
1808     Dynarr_add (dst, c);                        \
1809   else                                          \
1810     {                                           \
1811       Dynarr_add (dst, (c >> 6) | 0xc0);        \
1812       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
1813     }                                           \
1814 } while (0)
1815
1816 INLINE void
1817 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1818 {
1819   if ( c <= 0x7f )
1820     {
1821       Dynarr_add (dst, c);
1822     }
1823   else if ( c <= 0x7ff )
1824     {
1825       Dynarr_add (dst, (c >> 6) | 0xc0);
1826       Dynarr_add (dst, (c & 0x3f) | 0x80);
1827     }
1828   else if ( c <= 0xffff )
1829     {
1830       Dynarr_add (dst,  (c >> 12) | 0xe0);
1831       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1832       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1833     }
1834   else if ( c <= 0x1fffff )
1835     {
1836       Dynarr_add (dst,  (c >> 18) | 0xf0);
1837       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1838       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1839       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1840     }
1841   else if ( c <= 0x3ffffff )
1842     {
1843       Dynarr_add (dst,  (c >> 24) | 0xf8);
1844       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1845       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1846       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1847       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1848     }
1849   else
1850     {
1851       Dynarr_add (dst,  (c >> 30) | 0xfc);
1852       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1853       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1854       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1855       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1856       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1857     }
1858 }
1859 #else
1860 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
1861 do {                                            \
1862   if (BYTE_ASCII_P (c))                         \
1863     Dynarr_add (dst, c);                        \
1864   else if (BYTE_C1_P (c))                       \
1865     {                                           \
1866       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1867       Dynarr_add (dst, c + 0x20);               \
1868     }                                           \
1869   else                                          \
1870     {                                           \
1871       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1872       Dynarr_add (dst, c);                      \
1873     }                                           \
1874 } while (0)
1875 #endif
1876
1877 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
1878 do {                                    \
1879   if (ch)                               \
1880     {                                   \
1881       DECODE_ADD_BINARY_CHAR (ch, dst); \
1882       ch = 0;                           \
1883     }                                   \
1884 } while (0)
1885
1886 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1887 do {                                    \
1888   if (flags & CODING_STATE_END)         \
1889     {                                   \
1890       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
1891       if (flags & CODING_STATE_CR)      \
1892         Dynarr_add (dst, '\r');         \
1893     }                                   \
1894 } while (0)
1895
1896 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1897
1898 struct decoding_stream
1899 {
1900   /* Coding system that governs the conversion. */
1901   Lisp_Coding_System *codesys;
1902
1903   /* Stream that we read the encoded data from or
1904      write the decoded data to. */
1905   Lstream *other_end;
1906
1907   /* If we are reading, then we can return only a fixed amount of
1908      data, so if the conversion resulted in too much data, we store it
1909      here for retrieval the next time around. */
1910   unsigned_char_dynarr *runoff;
1911
1912   /* FLAGS holds flags indicating the current state of the decoding.
1913      Some of these flags are dependent on the coding system. */
1914   unsigned int flags;
1915
1916   /* CH holds a partially built-up character.  Since we only deal
1917      with one- and two-byte characters at the moment, we only use
1918      this to store the first byte of a two-byte character. */
1919   unsigned int ch;
1920
1921   /* EOL_TYPE specifies the type of end-of-line conversion that
1922      currently applies.  We need to keep this separate from the
1923      EOL type stored in CODESYS because the latter might indicate
1924      automatic EOL-type detection while the former will always
1925      indicate a particular EOL type. */
1926   enum eol_type eol_type;
1927 #ifdef MULE
1928   /* Additional ISO2022 information.  We define the structure above
1929      because it's also needed by the detection routines. */
1930   struct iso2022_decoder iso2022;
1931
1932   /* Additional information (the state of the running CCL program)
1933      used by the CCL decoder. */
1934   struct ccl_program ccl;
1935
1936   /* counter for UTF-8 or UCS-4 */
1937   unsigned char counter;
1938 #endif
1939   struct detection_state decst;
1940 };
1941
1942 static int decoding_reader     (Lstream *stream,       unsigned char *data, size_t size);
1943 static int decoding_writer     (Lstream *stream, CONST unsigned char *data, size_t size);
1944 static int decoding_rewinder   (Lstream *stream);
1945 static int decoding_seekable_p (Lstream *stream);
1946 static int decoding_flusher    (Lstream *stream);
1947 static int decoding_closer     (Lstream *stream);
1948
1949 static Lisp_Object decoding_marker (Lisp_Object stream,
1950                                     void (*markobj) (Lisp_Object));
1951
1952 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1953                                sizeof (struct decoding_stream));
1954
1955 static Lisp_Object
1956 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1957 {
1958   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1959   Lisp_Object str_obj;
1960
1961   /* We do not need to mark the coding systems or charsets stored
1962      within the stream because they are stored in a global list
1963      and automatically marked. */
1964
1965   XSETLSTREAM (str_obj, str);
1966   markobj (str_obj);
1967   if (str->imp->marker)
1968     return (str->imp->marker) (str_obj, markobj);
1969   else
1970     return Qnil;
1971 }
1972
1973 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
1974    so we read data from the other end, decode it, and store it into DATA. */
1975
1976 static int
1977 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1978 {
1979   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1980   unsigned char *orig_data = data;
1981   int read_size;
1982   int error_occurred = 0;
1983
1984   /* We need to interface to mule_decode(), which expects to take some
1985      amount of data and store the result into a Dynarr.  We have
1986      mule_decode() store into str->runoff, and take data from there
1987      as necessary. */
1988
1989   /* We loop until we have enough data, reading chunks from the other
1990      end and decoding it. */
1991   while (1)
1992     {
1993       /* Take data from the runoff if we can.  Make sure to take at
1994          most SIZE bytes, and delete the data from the runoff. */
1995       if (Dynarr_length (str->runoff) > 0)
1996         {
1997           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1998           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1999           Dynarr_delete_many (str->runoff, 0, chunk);
2000           data += chunk;
2001           size -= chunk;
2002         }
2003
2004       if (size == 0)
2005         break; /* No more room for data */
2006
2007       if (str->flags & CODING_STATE_END)
2008         /* This means that on the previous iteration, we hit the EOF on
2009            the other end.  We loop once more so that mule_decode() can
2010            output any final stuff it may be holding, or any "go back
2011            to a sane state" escape sequences. (This latter makes sense
2012            during encoding.) */
2013         break;
2014
2015       /* Exhausted the runoff, so get some more.  DATA has at least
2016          SIZE bytes left of storage in it, so it's OK to read directly
2017          into it.  (We'll be overwriting above, after we've decoded it
2018          into the runoff.) */
2019       read_size = Lstream_read (str->other_end, data, size);
2020       if (read_size < 0)
2021         {
2022           error_occurred = 1;
2023           break;
2024         }
2025       if (read_size == 0)
2026         /* There might be some more end data produced in the translation.
2027            See the comment above. */
2028         str->flags |= CODING_STATE_END;
2029       mule_decode (stream, data, str->runoff, read_size);
2030     }
2031
2032   if (data - orig_data == 0)
2033     return error_occurred ? -1 : 0;
2034   else
2035     return data - orig_data;
2036 }
2037
2038 static int
2039 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2040 {
2041   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2042   int retval;
2043
2044   /* Decode all our data into the runoff, and then attempt to write
2045      it all out to the other end.  Remove whatever chunk we succeeded
2046      in writing. */
2047   mule_decode (stream, data, str->runoff, size);
2048   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2049                           Dynarr_length (str->runoff));
2050   if (retval > 0)
2051     Dynarr_delete_many (str->runoff, 0, retval);
2052   /* Do NOT return retval.  The return value indicates how much
2053      of the incoming data was written, not how many bytes were
2054      written. */
2055   return size;
2056 }
2057
2058 static void
2059 reset_decoding_stream (struct decoding_stream *str)
2060 {
2061 #ifdef MULE
2062   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2063     {
2064       Lisp_Object coding_system;
2065       XSETCODING_SYSTEM (coding_system, str->codesys);
2066       reset_iso2022 (coding_system, &str->iso2022);
2067     }
2068   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2069     {
2070       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2071     }
2072   str->counter = 0;
2073 #endif /* MULE */
2074   str->flags = str->ch = 0;
2075 }
2076
2077 static int
2078 decoding_rewinder (Lstream *stream)
2079 {
2080   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2081   reset_decoding_stream (str);
2082   Dynarr_reset (str->runoff);
2083   return Lstream_rewind (str->other_end);
2084 }
2085
2086 static int
2087 decoding_seekable_p (Lstream *stream)
2088 {
2089   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2090   return Lstream_seekable_p (str->other_end);
2091 }
2092
2093 static int
2094 decoding_flusher (Lstream *stream)
2095 {
2096   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2097   return Lstream_flush (str->other_end);
2098 }
2099
2100 static int
2101 decoding_closer (Lstream *stream)
2102 {
2103   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2104   if (stream->flags & LSTREAM_FL_WRITE)
2105     {
2106       str->flags |= CODING_STATE_END;
2107       decoding_writer (stream, 0, 0);
2108     }
2109   Dynarr_free (str->runoff);
2110 #ifdef MULE
2111 #ifdef ENABLE_COMPOSITE_CHARS
2112   if (str->iso2022.composite_chars)
2113     Dynarr_free (str->iso2022.composite_chars);
2114 #endif
2115 #endif
2116   return Lstream_close (str->other_end);
2117 }
2118
2119 Lisp_Object
2120 decoding_stream_coding_system (Lstream *stream)
2121 {
2122   Lisp_Object coding_system;
2123   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2124
2125   XSETCODING_SYSTEM (coding_system, str->codesys);
2126   return subsidiary_coding_system (coding_system, str->eol_type);
2127 }
2128
2129 void
2130 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2131 {
2132   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2133   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2134   str->codesys = cs;
2135   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2136     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2137   reset_decoding_stream (str);
2138 }
2139
2140 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2141    stream for writing, no automatic code detection will be performed.
2142    The reason for this is that automatic code detection requires a
2143    seekable input.  Things will also fail if you open a decoding
2144    stream for reading using a non-fully-specified coding system and
2145    a non-seekable input stream. */
2146
2147 static Lisp_Object
2148 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2149                         CONST char *mode)
2150 {
2151   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2152   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2153   Lisp_Object obj;
2154
2155   xzero (*str);
2156   str->other_end = stream;
2157   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2158   str->eol_type = EOL_AUTODETECT;
2159   if (!strcmp (mode, "r")
2160       && Lstream_seekable_p (stream))
2161     /* We can determine the coding system now. */
2162     determine_real_coding_system (stream, &codesys, &str->eol_type);
2163   set_decoding_stream_coding_system (lstr, codesys);
2164   str->decst.eol_type = str->eol_type;
2165   str->decst.mask = ~0;
2166   XSETLSTREAM (obj, lstr);
2167   return obj;
2168 }
2169
2170 Lisp_Object
2171 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2172 {
2173   return make_decoding_stream_1 (stream, codesys, "r");
2174 }
2175
2176 Lisp_Object
2177 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2178 {
2179   return make_decoding_stream_1 (stream, codesys, "w");
2180 }
2181
2182 /* Note: the decode_coding_* functions all take the same
2183    arguments as mule_decode(), which is to say some SRC data of
2184    size N, which is to be stored into dynamic array DST.
2185    DECODING is the stream within which the decoding is
2186    taking place, but no data is actually read from or
2187    written to that stream; that is handled in decoding_reader()
2188    or decoding_writer().  This allows the same functions to
2189    be used for both reading and writing. */
2190
2191 static void
2192 mule_decode (Lstream *decoding, CONST unsigned char *src,
2193              unsigned_char_dynarr *dst, unsigned int n)
2194 {
2195   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2196
2197   /* If necessary, do encoding-detection now.  We do this when
2198      we're a writing stream or a non-seekable reading stream,
2199      meaning that we can't just process the whole input,
2200      rewind, and start over. */
2201
2202   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2203       str->eol_type == EOL_AUTODETECT)
2204     {
2205       Lisp_Object codesys;
2206
2207       XSETCODING_SYSTEM (codesys, str->codesys);
2208       detect_coding_type (&str->decst, src, n,
2209                           CODING_SYSTEM_TYPE (str->codesys) !=
2210                           CODESYS_AUTODETECT);
2211       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2212           str->decst.mask != ~0)
2213         /* #### This is cheesy.  What we really ought to do is
2214            buffer up a certain amount of data so as to get a
2215            less random result. */
2216         codesys = coding_system_from_mask (str->decst.mask);
2217       str->eol_type = str->decst.eol_type;
2218       if (XCODING_SYSTEM (codesys) != str->codesys)
2219         {
2220           /* Preserve the CODING_STATE_END flag in case it was set.
2221              If we erase it, bad things might happen. */
2222           int was_end = str->flags & CODING_STATE_END;
2223           set_decoding_stream_coding_system (decoding, codesys);
2224           if (was_end)
2225             str->flags |= CODING_STATE_END;
2226         }
2227     }
2228
2229   switch (CODING_SYSTEM_TYPE (str->codesys))
2230     {
2231 #ifdef DEBUG_XEMACS
2232     case CODESYS_INTERNAL:
2233       Dynarr_add_many (dst, src, n);
2234       break;
2235 #endif
2236     case CODESYS_AUTODETECT:
2237       /* If we got this far and still haven't decided on the coding
2238          system, then do no conversion. */
2239     case CODESYS_NO_CONVERSION:
2240       decode_coding_no_conversion (decoding, src, dst, n);
2241       break;
2242 #ifdef MULE
2243     case CODESYS_SHIFT_JIS:
2244       decode_coding_sjis (decoding, src, dst, n);
2245       break;
2246     case CODESYS_BIG5:
2247       decode_coding_big5 (decoding, src, dst, n);
2248       break;
2249     case CODESYS_UCS4:
2250       decode_coding_ucs4 (decoding, src, dst, n);
2251       break;
2252     case CODESYS_UTF8:
2253       decode_coding_utf8 (decoding, src, dst, n);
2254       break;
2255     case CODESYS_CCL:
2256       str->ccl.last_block = str->flags & CODING_STATE_END;
2257       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2258       break;
2259     case CODESYS_ISO2022:
2260       decode_coding_iso2022 (decoding, src, dst, n);
2261       break;
2262 #endif /* MULE */
2263     default:
2264       abort ();
2265     }
2266 }
2267
2268 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2269 Decode the text between START and END which is encoded in CODING-SYSTEM.
2270 This is useful if you've read in encoded text from a file without decoding
2271 it (e.g. you read in a JIS-formatted file but used the `binary' or
2272 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2273 Return length of decoded text.
2274 BUFFER defaults to the current buffer if unspecified.
2275 */
2276        (start, end, coding_system, buffer))
2277 {
2278   Bufpos b, e;
2279   struct buffer *buf = decode_buffer (buffer, 0);
2280   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2281   Lstream *istr, *ostr;
2282   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2283
2284   get_buffer_range_char (buf, start, end, &b, &e, 0);
2285
2286   barf_if_buffer_read_only (buf, b, e);
2287
2288   coding_system = Fget_coding_system (coding_system);
2289   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2290   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2291   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2292                                               coding_system);
2293   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2294                                            Fget_coding_system (Qbinary));
2295   istr = XLSTREAM (instream);
2296   ostr = XLSTREAM (outstream);
2297   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2298
2299   /* The chain of streams looks like this:
2300
2301      [BUFFER] <----- send through
2302                      ------> [ENCODE AS BINARY]
2303                              ------> [DECODE AS SPECIFIED]
2304                                      ------> [BUFFER]
2305    */
2306
2307   while (1)
2308     {
2309       char tempbuf[1024]; /* some random amount */
2310       Bufpos newpos, even_newer_pos;
2311       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2312       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2313
2314       if (!size_in_bytes)
2315         break;
2316       newpos = lisp_buffer_stream_startpos (istr);
2317       Lstream_write (ostr, tempbuf, size_in_bytes);
2318       even_newer_pos = lisp_buffer_stream_startpos (istr);
2319       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2320                            even_newer_pos, 0);
2321     }
2322   Lstream_close (istr);
2323   Lstream_close (ostr);
2324   UNGCPRO;
2325   Lstream_delete (istr);
2326   Lstream_delete (ostr);
2327   Lstream_delete (XLSTREAM (de_outstream));
2328   Lstream_delete (XLSTREAM (lb_outstream));
2329   return Qnil;
2330 }
2331
2332 \f
2333 /************************************************************************/
2334 /*           Converting to an external encoding ("encoding")            */
2335 /************************************************************************/
2336
2337 /* An encoding stream is an output stream.  When you create the
2338    stream, you specify the coding system that governs the encoding
2339    and another stream that the resulting encoded data is to be
2340    sent to, and then start sending data to it. */
2341
2342 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2343
2344 struct encoding_stream
2345 {
2346   /* Coding system that governs the conversion. */
2347   Lisp_Coding_System *codesys;
2348
2349   /* Stream that we read the encoded data from or
2350      write the decoded data to. */
2351   Lstream *other_end;
2352
2353   /* If we are reading, then we can return only a fixed amount of
2354      data, so if the conversion resulted in too much data, we store it
2355      here for retrieval the next time around. */
2356   unsigned_char_dynarr *runoff;
2357
2358   /* FLAGS holds flags indicating the current state of the encoding.
2359      Some of these flags are dependent on the coding system. */
2360   unsigned int flags;
2361
2362   /* CH holds a partially built-up character.  Since we only deal
2363      with one- and two-byte characters at the moment, we only use
2364      this to store the first byte of a two-byte character. */
2365   unsigned int ch;
2366 #ifdef MULE
2367   /* Additional information used by the ISO2022 encoder. */
2368   struct
2369     {
2370       /* CHARSET holds the character sets currently assigned to the G0
2371          through G3 registers.  It is initialized from the array
2372          INITIAL_CHARSET in CODESYS. */
2373       Lisp_Object charset[4];
2374
2375       /* Which registers are currently invoked into the left (GL) and
2376          right (GR) halves of the 8-bit encoding space? */
2377       int register_left, register_right;
2378
2379       /* Whether we need to explicitly designate the charset in the
2380          G? register before using it.  It is initialized from the
2381          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2382       unsigned char force_charset_on_output[4];
2383
2384       /* Other state variables that need to be preserved across
2385          invocations. */
2386       Lisp_Object current_charset;
2387       int current_half;
2388       int current_char_boundary;
2389     } iso2022;
2390
2391   /* Additional information (the state of the running CCL program)
2392      used by the CCL encoder. */
2393   struct ccl_program ccl;
2394 #endif /* MULE */
2395 };
2396
2397 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2398 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2399                             size_t size);
2400 static int encoding_rewinder   (Lstream *stream);
2401 static int encoding_seekable_p (Lstream *stream);
2402 static int encoding_flusher    (Lstream *stream);
2403 static int encoding_closer     (Lstream *stream);
2404
2405 static Lisp_Object encoding_marker (Lisp_Object stream,
2406                                     void (*markobj) (Lisp_Object));
2407
2408 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2409                                sizeof (struct encoding_stream));
2410
2411 static Lisp_Object
2412 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2413 {
2414   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2415   Lisp_Object str_obj;
2416
2417   /* We do not need to mark the coding systems or charsets stored
2418      within the stream because they are stored in a global list
2419      and automatically marked. */
2420
2421   XSETLSTREAM (str_obj, str);
2422   markobj (str_obj);
2423   if (str->imp->marker)
2424     return (str->imp->marker) (str_obj, markobj);
2425   else
2426     return Qnil;
2427 }
2428
2429 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2430    so we read data from the other end, encode it, and store it into DATA. */
2431
2432 static int
2433 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2434 {
2435   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2436   unsigned char *orig_data = data;
2437   int read_size;
2438   int error_occurred = 0;
2439
2440   /* We need to interface to mule_encode(), which expects to take some
2441      amount of data and store the result into a Dynarr.  We have
2442      mule_encode() store into str->runoff, and take data from there
2443      as necessary. */
2444
2445   /* We loop until we have enough data, reading chunks from the other
2446      end and encoding it. */
2447   while (1)
2448     {
2449       /* Take data from the runoff if we can.  Make sure to take at
2450          most SIZE bytes, and delete the data from the runoff. */
2451       if (Dynarr_length (str->runoff) > 0)
2452         {
2453           int chunk = min ((int) size, Dynarr_length (str->runoff));
2454           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2455           Dynarr_delete_many (str->runoff, 0, chunk);
2456           data += chunk;
2457           size -= chunk;
2458         }
2459
2460       if (size == 0)
2461         break; /* No more room for data */
2462
2463       if (str->flags & CODING_STATE_END)
2464         /* This means that on the previous iteration, we hit the EOF on
2465            the other end.  We loop once more so that mule_encode() can
2466            output any final stuff it may be holding, or any "go back
2467            to a sane state" escape sequences. (This latter makes sense
2468            during encoding.) */
2469         break;
2470
2471       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2472          left of storage in it, so it's OK to read directly into it.
2473          (We'll be overwriting above, after we've encoded it into the
2474          runoff.) */
2475       read_size = Lstream_read (str->other_end, data, size);
2476       if (read_size < 0)
2477         {
2478           error_occurred = 1;
2479           break;
2480         }
2481       if (read_size == 0)
2482         /* There might be some more end data produced in the translation.
2483            See the comment above. */
2484         str->flags |= CODING_STATE_END;
2485       mule_encode (stream, data, str->runoff, read_size);
2486     }
2487
2488   if (data == orig_data)
2489     return error_occurred ? -1 : 0;
2490   else
2491     return data - orig_data;
2492 }
2493
2494 static int
2495 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2496 {
2497   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2498   int retval;
2499
2500   /* Encode all our data into the runoff, and then attempt to write
2501      it all out to the other end.  Remove whatever chunk we succeeded
2502      in writing. */
2503   mule_encode (stream, data, str->runoff, size);
2504   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2505                           Dynarr_length (str->runoff));
2506   if (retval > 0)
2507     Dynarr_delete_many (str->runoff, 0, retval);
2508   /* Do NOT return retval.  The return value indicates how much
2509      of the incoming data was written, not how many bytes were
2510      written. */
2511   return size;
2512 }
2513
2514 static void
2515 reset_encoding_stream (struct encoding_stream *str)
2516 {
2517 #ifdef MULE
2518   switch (CODING_SYSTEM_TYPE (str->codesys))
2519     {
2520     case CODESYS_ISO2022:
2521       {
2522         int i;
2523
2524         for (i = 0; i < 4; i++)
2525           {
2526             str->iso2022.charset[i] =
2527               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2528             str->iso2022.force_charset_on_output[i] =
2529               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2530           }
2531         str->iso2022.register_left = 0;
2532         str->iso2022.register_right = 1;
2533         str->iso2022.current_charset = Qnil;
2534         str->iso2022.current_half = 0;
2535 #ifdef UTF2000
2536         str->iso2022.current_char_boundary = 0;
2537 #else
2538         str->iso2022.current_char_boundary = 1;
2539 #endif
2540         break;
2541       }
2542     case CODESYS_CCL:
2543       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2544       break;
2545     default:
2546       break;
2547     }
2548 #endif /* MULE */
2549
2550   str->flags = str->ch = 0;
2551 }
2552
2553 static int
2554 encoding_rewinder (Lstream *stream)
2555 {
2556   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2557   reset_encoding_stream (str);
2558   Dynarr_reset (str->runoff);
2559   return Lstream_rewind (str->other_end);
2560 }
2561
2562 static int
2563 encoding_seekable_p (Lstream *stream)
2564 {
2565   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2566   return Lstream_seekable_p (str->other_end);
2567 }
2568
2569 static int
2570 encoding_flusher (Lstream *stream)
2571 {
2572   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2573   return Lstream_flush (str->other_end);
2574 }
2575
2576 static int
2577 encoding_closer (Lstream *stream)
2578 {
2579   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2580   if (stream->flags & LSTREAM_FL_WRITE)
2581     {
2582       str->flags |= CODING_STATE_END;
2583       encoding_writer (stream, 0, 0);
2584     }
2585   Dynarr_free (str->runoff);
2586   return Lstream_close (str->other_end);
2587 }
2588
2589 Lisp_Object
2590 encoding_stream_coding_system (Lstream *stream)
2591 {
2592   Lisp_Object coding_system;
2593   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2594
2595   XSETCODING_SYSTEM (coding_system, str->codesys);
2596   return coding_system;
2597 }
2598
2599 void
2600 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2601 {
2602   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2603   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2604   str->codesys = cs;
2605   reset_encoding_stream (str);
2606 }
2607
2608 static Lisp_Object
2609 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2610                         CONST char *mode)
2611 {
2612   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2613   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2614   Lisp_Object obj;
2615
2616   xzero (*str);
2617   str->runoff = Dynarr_new (unsigned_char);
2618   str->other_end = stream;
2619   set_encoding_stream_coding_system (lstr, codesys);
2620   XSETLSTREAM (obj, lstr);
2621   return obj;
2622 }
2623
2624 Lisp_Object
2625 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2626 {
2627   return make_encoding_stream_1 (stream, codesys, "r");
2628 }
2629
2630 Lisp_Object
2631 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2632 {
2633   return make_encoding_stream_1 (stream, codesys, "w");
2634 }
2635
2636 /* Convert N bytes of internally-formatted data stored in SRC to an
2637    external format, according to the encoding stream ENCODING.
2638    Store the encoded data into DST. */
2639
2640 static void
2641 mule_encode (Lstream *encoding, CONST unsigned char *src,
2642              unsigned_char_dynarr *dst, unsigned int n)
2643 {
2644   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2645
2646   switch (CODING_SYSTEM_TYPE (str->codesys))
2647     {
2648 #ifdef DEBUG_XEMACS
2649     case CODESYS_INTERNAL:
2650       Dynarr_add_many (dst, src, n);
2651       break;
2652 #endif
2653     case CODESYS_AUTODETECT:
2654       /* If we got this far and still haven't decided on the coding
2655          system, then do no conversion. */
2656     case CODESYS_NO_CONVERSION:
2657       encode_coding_no_conversion (encoding, src, dst, n);
2658       break;
2659 #ifdef MULE
2660     case CODESYS_SHIFT_JIS:
2661       encode_coding_sjis (encoding, src, dst, n);
2662       break;
2663     case CODESYS_BIG5:
2664       encode_coding_big5 (encoding, src, dst, n);
2665       break;
2666     case CODESYS_UCS4:
2667       encode_coding_ucs4 (encoding, src, dst, n);
2668       break;
2669     case CODESYS_UTF8:
2670       encode_coding_utf8 (encoding, src, dst, n);
2671       break;
2672     case CODESYS_CCL:
2673       str->ccl.last_block = str->flags & CODING_STATE_END;
2674       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2675       break;
2676     case CODESYS_ISO2022:
2677       encode_coding_iso2022 (encoding, src, dst, n);
2678       break;
2679 #endif /* MULE */
2680     default:
2681       abort ();
2682     }
2683 }
2684
2685 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2686 Encode the text between START and END using CODING-SYSTEM.
2687 This will, for example, convert Japanese characters into stuff such as
2688 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2689 text.  BUFFER defaults to the current buffer if unspecified.
2690 */
2691        (start, end, coding_system, buffer))
2692 {
2693   Bufpos b, e;
2694   struct buffer *buf = decode_buffer (buffer, 0);
2695   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2696   Lstream *istr, *ostr;
2697   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2698
2699   get_buffer_range_char (buf, start, end, &b, &e, 0);
2700
2701   barf_if_buffer_read_only (buf, b, e);
2702
2703   coding_system = Fget_coding_system (coding_system);
2704   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2705   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2706   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2707                                               Fget_coding_system (Qbinary));
2708   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2709                                            coding_system);
2710   istr = XLSTREAM (instream);
2711   ostr = XLSTREAM (outstream);
2712   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2713   /* The chain of streams looks like this:
2714
2715      [BUFFER] <----- send through
2716                      ------> [ENCODE AS SPECIFIED]
2717                              ------> [DECODE AS BINARY]
2718                                      ------> [BUFFER]
2719    */
2720   while (1)
2721     {
2722       char tempbuf[1024]; /* some random amount */
2723       Bufpos newpos, even_newer_pos;
2724       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2725       int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2726
2727       if (!size_in_bytes)
2728         break;
2729       newpos = lisp_buffer_stream_startpos (istr);
2730       Lstream_write (ostr, tempbuf, size_in_bytes);
2731       even_newer_pos = lisp_buffer_stream_startpos (istr);
2732       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2733                            even_newer_pos, 0);
2734     }
2735
2736   {
2737     Charcount retlen =
2738       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2739     Lstream_close (istr);
2740     Lstream_close (ostr);
2741     UNGCPRO;
2742     Lstream_delete (istr);
2743     Lstream_delete (ostr);
2744     Lstream_delete (XLSTREAM (de_outstream));
2745     Lstream_delete (XLSTREAM (lb_outstream));
2746     return make_int (retlen);
2747   }
2748 }
2749
2750 #ifdef MULE
2751 \f
2752 /************************************************************************/
2753 /*                          Shift-JIS methods                           */
2754 /************************************************************************/
2755
2756 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2757    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2758    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2759    encoded by "position-code + 0x80".  A character of JISX0208
2760    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2761    position-codes are divided and shifted so that it fit in the range
2762    below.
2763
2764    --- CODE RANGE of Shift-JIS ---
2765    (character set)      (range)
2766    ASCII                0x00 .. 0x7F
2767    JISX0201-Kana        0xA0 .. 0xDF
2768    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2769             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2770    -------------------------------
2771
2772 */
2773
2774 /* Is this the first byte of a Shift-JIS two-byte char? */
2775
2776 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2777   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2778
2779 /* Is this the second byte of a Shift-JIS two-byte char? */
2780
2781 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2782   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2783
2784 #define BYTE_SJIS_KATAKANA_P(c) \
2785   ((c) >= 0xA1 && (c) <= 0xDF)
2786
2787 static int
2788 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2789                     unsigned int n)
2790 {
2791   int c;
2792
2793   while (n--)
2794     {
2795       c = *src++;
2796       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2797         return 0;
2798       if (st->shift_jis.in_second_byte)
2799         {
2800           st->shift_jis.in_second_byte = 0;
2801           if (c < 0x40)
2802             return 0;
2803         }
2804       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2805         st->shift_jis.in_second_byte = 1;
2806     }
2807   return CODING_CATEGORY_SHIFT_JIS_MASK;
2808 }
2809
2810 /* Convert Shift-JIS data to internal format. */
2811
2812 static void
2813 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2814                     unsigned_char_dynarr *dst, unsigned int n)
2815 {
2816   unsigned char c;
2817   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2818   unsigned int flags  = str->flags;
2819   unsigned int ch     = str->ch;
2820   eol_type_t eol_type = str->eol_type;
2821
2822   while (n--)
2823     {
2824       c = *src++;
2825
2826       if (ch)
2827         {
2828           /* Previous character was first byte of Shift-JIS Kanji char. */
2829           if (BYTE_SJIS_TWO_BYTE_2_P (c))
2830             {
2831               unsigned char e1, e2;
2832
2833               DECODE_SJIS (ch, c, e1, e2);
2834 #ifdef UTF2000
2835               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
2836                                             e1 & 0x7F,
2837                                             e2 & 0x7F), dst);
2838 #else
2839               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2840               Dynarr_add (dst, e1);
2841               Dynarr_add (dst, e2);
2842 #endif
2843             }
2844           else
2845             {
2846               DECODE_ADD_BINARY_CHAR (ch, dst);
2847               DECODE_ADD_BINARY_CHAR (c, dst);
2848             }
2849           ch = 0;
2850         }
2851       else
2852         {
2853           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2854           if (BYTE_SJIS_TWO_BYTE_1_P (c))
2855             ch = c;
2856           else if (BYTE_SJIS_KATAKANA_P (c))
2857             {
2858 #ifdef UTF2000
2859               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
2860                                             c & 0x7F, 0), dst);
2861 #else
2862               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2863               Dynarr_add (dst, c);
2864 #endif
2865             }
2866           else
2867             DECODE_ADD_BINARY_CHAR (c, dst);
2868         }
2869     label_continue_loop:;
2870     }
2871
2872   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2873
2874   str->flags = flags;
2875   str->ch    = ch;
2876 }
2877
2878 /* Convert internally-formatted data to Shift-JIS. */
2879
2880 static void
2881 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2882                     unsigned_char_dynarr *dst, unsigned int n)
2883 {
2884   unsigned char c;
2885   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2886   unsigned int flags  = str->flags;
2887   unsigned int ch     = str->ch;
2888   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2889 #ifdef UTF2000
2890   unsigned char char_boundary = str->iso2022.current_char_boundary;
2891 #endif
2892
2893   while (n--)
2894     {
2895       c = *src++;
2896 #ifdef UTF2000
2897       switch (char_boundary)
2898         {
2899         case 0:
2900           if ( c >= 0xfc )
2901             {
2902               ch = c & 0x01;
2903               char_boundary = 5;
2904             }
2905           else if ( c >= 0xf8 )
2906             {
2907               ch = c & 0x03;
2908               char_boundary = 4;
2909             }
2910           else if ( c >= 0xf0 )
2911             {
2912               ch = c & 0x07;
2913               char_boundary = 3;
2914             }
2915           else if ( c >= 0xe0 )
2916             {
2917               ch = c & 0x0f;
2918               char_boundary = 2;
2919             }
2920           else if ( c >= 0xc0 )
2921             {
2922               ch = c & 0x1f;
2923               char_boundary = 1;
2924             }
2925           else
2926             {
2927               ch = 0;
2928               if (c == '\n')
2929                 {
2930                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2931                     Dynarr_add (dst, '\r');
2932                   if (eol_type != EOL_CR)
2933                     Dynarr_add (dst, c);
2934                 }
2935               else
2936                 Dynarr_add (dst, c);
2937               char_boundary = 0;
2938             }
2939           break;
2940         case 1:
2941           ch = ( ch << 6 ) | ( c & 0x3f );
2942           {
2943             Lisp_Object charset;
2944             unsigned int c1, c2, s1, s2;
2945
2946             BREAKUP_CHAR (ch, charset, c1, c2);
2947             if (EQ(charset, Vcharset_katakana_jisx0201))
2948               {
2949                 Dynarr_add (dst, c1 | 0x80);
2950               }
2951             else if (EQ(charset, Vcharset_japanese_jisx0208))
2952               {
2953                 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2954                 Dynarr_add (dst, s1);
2955                 Dynarr_add (dst, s2);
2956               }
2957           }
2958           char_boundary = 0;
2959           break;
2960         default:
2961           ch = ( ch << 6 ) | ( c & 0x3f );
2962           char_boundary--;
2963         }
2964 #else
2965       if (c == '\n')
2966         {
2967           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2968             Dynarr_add (dst, '\r');
2969           if (eol_type != EOL_CR)
2970             Dynarr_add (dst, '\n');
2971           ch = 0;
2972         }
2973       else if (BYTE_ASCII_P (c))
2974         {
2975           Dynarr_add (dst, c);
2976           ch = 0;
2977         }
2978       else if (BUFBYTE_LEADING_BYTE_P (c))
2979         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2980               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2981               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2982       else if (ch)
2983         {
2984           if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2985             {
2986               Dynarr_add (dst, c);
2987               ch = 0;
2988             }
2989           else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2990                    ch == LEADING_BYTE_JAPANESE_JISX0208)
2991             ch = c;
2992           else
2993             {
2994               unsigned char j1, j2;
2995               ENCODE_SJIS (ch, c, j1, j2);
2996               Dynarr_add (dst, j1);
2997               Dynarr_add (dst, j2);
2998               ch = 0;
2999             }
3000         }
3001 #endif
3002     }
3003
3004   str->flags = flags;
3005   str->ch    = ch;
3006 #ifdef UTF2000
3007   str->iso2022.current_char_boundary = char_boundary;
3008 #endif
3009 }
3010
3011 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3012 Decode a JISX0208 character of Shift-JIS coding-system.
3013 CODE is the character code in Shift-JIS as a cons of type bytes.
3014 Return the corresponding character.
3015 */
3016        (code))
3017 {
3018   unsigned char c1, c2, s1, s2;
3019
3020   CHECK_CONS (code);
3021   CHECK_INT (XCAR (code));
3022   CHECK_INT (XCDR (code));
3023   s1 = XINT (XCAR (code));
3024   s2 = XINT (XCDR (code));
3025   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3026       BYTE_SJIS_TWO_BYTE_2_P (s2))
3027     {
3028       DECODE_SJIS (s1, s2, c1, c2);
3029       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3030                                    c1 & 0x7F, c2 & 0x7F));
3031     }
3032   else
3033     return Qnil;
3034 }
3035
3036 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3037 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3038 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3039 */
3040        (ch))
3041 {
3042   Lisp_Object charset;
3043   int c1, c2, s1, s2;
3044
3045   CHECK_CHAR_COERCE_INT (ch);
3046   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3047   if (EQ (charset, Vcharset_japanese_jisx0208))
3048     {
3049       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3050       return Fcons (make_int (s1), make_int (s2));
3051     }
3052   else
3053     return Qnil;
3054 }
3055
3056 \f
3057 /************************************************************************/
3058 /*                            Big5 methods                              */
3059 /************************************************************************/
3060
3061 /* BIG5 is a coding system encoding two character sets: ASCII and
3062    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3063    character set and is encoded in two-byte.
3064
3065    --- CODE RANGE of BIG5 ---
3066    (character set)      (range)
3067    ASCII                0x00 .. 0x7F
3068    Big5 (1st byte)      0xA1 .. 0xFE
3069         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3070    --------------------------
3071
3072    Since the number of characters in Big5 is larger than maximum
3073    characters in Emacs' charset (96x96), it can't be handled as one
3074    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3075    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3076    contains frequently used characters and the latter contains less
3077    frequently used characters.  */
3078
3079 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3080   ((c) >= 0xA1 && (c) <= 0xFE)
3081
3082 /* Is this the second byte of a Shift-JIS two-byte char? */
3083
3084 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3085   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3086
3087 /* Number of Big5 characters which have the same code in 1st byte.  */
3088
3089 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3090
3091 /* Code conversion macros.  These are macros because they are used in
3092    inner loops during code conversion.
3093
3094    Note that temporary variables in macros introduce the classic
3095    dynamic-scoping problems with variable names.  We use capital-
3096    lettered variables in the assumption that XEmacs does not use
3097    capital letters in variables except in a very formalized way
3098    (e.g. Qstring). */
3099
3100 /* Convert Big5 code (b1, b2) into its internal string representation
3101    (lb, c1, c2). */
3102
3103 /* There is a much simpler way to split the Big5 charset into two.
3104    For the moment I'm going to leave the algorithm as-is because it
3105    claims to separate out the most-used characters into a single
3106    charset, which perhaps will lead to optimizations in various
3107    places.
3108
3109    The way the algorithm works is something like this:
3110
3111    Big5 can be viewed as a 94x157 charset, where the row is
3112    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3113    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3114    the split between low and high column numbers is apparently
3115    meaningless; ascending rows produce less and less frequent chars.
3116    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3117    the first charset, and the upper half (0xC9 .. 0xFE) to the
3118    second.  To do the conversion, we convert the character into
3119    a single number where 0 .. 156 is the first row, 157 .. 313
3120    is the second, etc.  That way, the characters are ordered by
3121    decreasing frequency.  Then we just chop the space in two
3122    and coerce the result into a 94x94 space.
3123    */
3124
3125 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3126 {                                                                       \
3127   int B1 = b1, B2 = b2;                                                 \
3128   unsigned int I                                                        \
3129     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3130                                                                         \
3131   if (B1 < 0xC9)                                                        \
3132     {                                                                   \
3133       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3134     }                                                                   \
3135   else                                                                  \
3136     {                                                                   \
3137       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3138       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3139     }                                                                   \
3140   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3141   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3142 } while (0)
3143
3144 /* Convert the internal string representation of a Big5 character
3145    (lb, c1, c2) into Big5 code (b1, b2). */
3146
3147 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3148 {                                                                       \
3149   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3150                                                                         \
3151   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3152     {                                                                   \
3153       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3154     }                                                                   \
3155   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3156   b2 = I % BIG5_SAME_ROW;                                               \
3157   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3158 } while (0)
3159
3160 static int
3161 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3162                     unsigned int n)
3163 {
3164   int c;
3165
3166   while (n--)
3167     {
3168       c = *src++;
3169       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3170           (c >= 0x80 && c <= 0xA0))
3171         return 0;
3172       if (st->big5.in_second_byte)
3173         {
3174           st->big5.in_second_byte = 0;
3175           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3176             return 0;
3177         }
3178       else if (c >= 0xA1)
3179         st->big5.in_second_byte = 1;
3180     }
3181   return CODING_CATEGORY_BIG5_MASK;
3182 }
3183
3184 /* Convert Big5 data to internal format. */
3185
3186 static void
3187 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3188                     unsigned_char_dynarr *dst, unsigned int n)
3189 {
3190   unsigned char c;
3191   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3192   unsigned int flags  = str->flags;
3193   unsigned int ch     = str->ch;
3194   eol_type_t eol_type = str->eol_type;
3195
3196   while (n--)
3197     {
3198       c = *src++;
3199       if (ch)
3200         {
3201           /* Previous character was first byte of Big5 char. */
3202           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3203             {
3204               unsigned char b1, b2, b3;
3205               DECODE_BIG5 (ch, c, b1, b2, b3);
3206               Dynarr_add (dst, b1);
3207               Dynarr_add (dst, b2);
3208               Dynarr_add (dst, b3);
3209             }
3210           else
3211             {
3212               DECODE_ADD_BINARY_CHAR (ch, dst);
3213               DECODE_ADD_BINARY_CHAR (c, dst);
3214             }
3215           ch = 0;
3216         }
3217       else
3218         {
3219           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3220           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3221             ch = c;
3222           else
3223             DECODE_ADD_BINARY_CHAR (c, dst);
3224         }
3225     label_continue_loop:;
3226     }
3227
3228   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3229
3230   str->flags = flags;
3231   str->ch    = ch;
3232 }
3233
3234 /* Convert internally-formatted data to Big5. */
3235
3236 static void
3237 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3238                     unsigned_char_dynarr *dst, unsigned int n)
3239 {
3240 #ifndef UTF2000
3241   unsigned char c;
3242   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3243   unsigned int flags  = str->flags;
3244   unsigned int ch     = str->ch;
3245   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3246
3247   while (n--)
3248     {
3249       c = *src++;
3250       if (c == '\n')
3251         {
3252           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3253             Dynarr_add (dst, '\r');
3254           if (eol_type != EOL_CR)
3255             Dynarr_add (dst, '\n');
3256         }
3257       else if (BYTE_ASCII_P (c))
3258         {
3259           /* ASCII. */
3260           Dynarr_add (dst, c);
3261         }
3262       else if (BUFBYTE_LEADING_BYTE_P (c))
3263         {
3264           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3265               c == LEADING_BYTE_CHINESE_BIG5_2)
3266             {
3267               /* A recognized leading byte. */
3268               ch = c;
3269               continue; /* not done with this character. */
3270             }
3271           /* otherwise just ignore this character. */
3272         }
3273       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3274                ch == LEADING_BYTE_CHINESE_BIG5_2)
3275         {
3276           /* Previous char was a recognized leading byte. */
3277           ch = (ch << 8) | c;
3278           continue; /* not done with this character. */
3279         }
3280       else if (ch)
3281         {
3282           /* Encountering second byte of a Big5 character. */
3283           unsigned char b1, b2;
3284
3285           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3286           Dynarr_add (dst, b1);
3287           Dynarr_add (dst, b2);
3288         }
3289
3290       ch = 0;
3291     }
3292
3293   str->flags = flags;
3294   str->ch    = ch;
3295 #endif
3296 }
3297
3298
3299 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3300 Decode a Big5 character CODE of BIG5 coding-system.
3301 CODE is the character code in BIG5, a cons of two integers.
3302 Return the corresponding character.
3303 */
3304        (code))
3305 {
3306   unsigned char c1, c2, b1, b2;
3307
3308   CHECK_CONS (code);
3309   CHECK_INT (XCAR (code));
3310   CHECK_INT (XCDR (code));
3311   b1 = XINT (XCAR (code));
3312   b2 = XINT (XCDR (code));
3313   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3314       BYTE_BIG5_TWO_BYTE_2_P (b2))
3315     {
3316       Charset_ID leading_byte;
3317       Lisp_Object charset;
3318       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3319       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3320       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3321     }
3322   else
3323     return Qnil;
3324 }
3325
3326 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3327 Encode the Big5 character CH to BIG5 coding-system.
3328 Return the corresponding character code in Big5.
3329 */
3330        (ch))
3331 {
3332   Lisp_Object charset;
3333   int c1, c2, b1, b2;
3334
3335   CHECK_CHAR_COERCE_INT (ch);
3336   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3337   if (EQ (charset, Vcharset_chinese_big5_1) ||
3338       EQ (charset, Vcharset_chinese_big5_2))
3339     {
3340       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3341                    b1, b2);
3342       return Fcons (make_int (b1), make_int (b2));
3343     }
3344   else
3345     return Qnil;
3346 }
3347
3348 \f
3349 /************************************************************************/
3350 /*                           UCS-4 methods                              */
3351 /*                                                                      */
3352 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3353 /*                                                                      */
3354 /************************************************************************/
3355
3356 Lisp_Object ucs_to_mule_table[65536];
3357 Lisp_Object mule_to_ucs_table;
3358
3359 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3360 Map UCS-4 code CODE to Mule character CHARACTER.
3361
3362 Return T on success, NIL on failure.
3363 */
3364        (code, character))
3365 {
3366   unsigned int c;
3367
3368   CHECK_CHAR (character);
3369   CHECK_INT (code);
3370   c = XINT (code);
3371
3372   if (c < sizeof (ucs_to_mule_table))
3373     {
3374       ucs_to_mule_table[c] = character;
3375       return Qt;
3376     }
3377   else
3378     return Qnil;
3379 }
3380
3381 static Lisp_Object
3382 ucs_to_char (unsigned long code)
3383 {
3384   if (code < sizeof (ucs_to_mule_table))
3385     {
3386       return ucs_to_mule_table[code];
3387     }
3388   else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3389     {
3390       unsigned int c;
3391
3392       code -= 0xe00000;
3393       c = code % (94 * 94);
3394       return make_char
3395         (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3396                     (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3397                      CHARSET_LEFT_TO_RIGHT),
3398                     c / 94 + 33, c % 94 + 33));
3399     }
3400   else
3401     return Qnil;
3402 }
3403
3404 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3405 Return Mule character corresponding to UCS code CODE (a positive integer).
3406 */
3407        (code))
3408 {
3409   CHECK_NATNUM (code);
3410   return ucs_to_char (XINT (code));
3411 }
3412
3413 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3414 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3415 */
3416        (character, code))
3417 {
3418   /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3419           Fset_char_ucs is more restrictive on index arg, but should
3420           check code arg in a char_table method. */
3421   CHECK_CHAR (character);
3422   CHECK_NATNUM (code);
3423   return Fput_char_table (character, code, mule_to_ucs_table);
3424 }
3425
3426 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3427 Return the UCS code (a positive integer) corresponding to CHARACTER.
3428 */
3429        (character))
3430 {
3431   return Fget_char_table (character, mule_to_ucs_table);
3432 }
3433
3434 #ifdef UTF2000
3435 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3436 #else
3437 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3438    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3439    is not found, instead.
3440    #### do something more appropriate (use blob?)
3441         Danger, Will Robinson!  Data loss.  Should we signal user? */
3442 static void
3443 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3444 {
3445   Lisp_Object chr = ucs_to_char (ch);
3446
3447   if (! NILP (chr))
3448     {
3449       Bufbyte work[MAX_EMCHAR_LEN];
3450       int len;
3451
3452       ch = XCHAR (chr);
3453       len = (ch < 128) ?
3454         simple_set_charptr_emchar (work, ch) :
3455         non_ascii_set_charptr_emchar (work, ch);
3456       Dynarr_add_many (dst, work, len);
3457     }
3458   else
3459     {
3460       Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3461       Dynarr_add (dst, 34 + 128);
3462       Dynarr_add (dst, 46 + 128);
3463     }
3464 }
3465 #endif
3466
3467 static unsigned long
3468 mule_char_to_ucs4 (Lisp_Object charset,
3469                    unsigned char h, unsigned char l)
3470 {
3471   Lisp_Object code
3472     = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3473                        mule_to_ucs_table);
3474
3475   if (INTP (code))
3476     {
3477       return XINT (code);
3478     }
3479   else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3480             (XCHARSET_CHARS (charset) == 94) )
3481     {
3482       unsigned char final = XCHARSET_FINAL (charset);
3483
3484       if ( ('@' <= final) && (final < 0x7f) )
3485         {
3486           return 0xe00000 + (final - '@') * 94 * 94
3487             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3488         }
3489       else
3490         {
3491           return '?';
3492         }
3493     }
3494   else
3495     {
3496       return '?';
3497     }
3498 }
3499
3500 static void
3501 encode_ucs4 (Lisp_Object charset,
3502              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3503 {
3504   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3505   Dynarr_add (dst,  code >> 24);
3506   Dynarr_add (dst, (code >> 16) & 255);
3507   Dynarr_add (dst, (code >>  8) & 255);
3508   Dynarr_add (dst,  code        & 255);
3509 }
3510
3511 static int
3512 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3513                     unsigned int n)
3514 {
3515   while (n--)
3516     {
3517       int c = *src++;
3518       switch (st->ucs4.in_byte)
3519         {
3520         case 0:
3521           if (c >= 128)
3522             return 0;
3523           else
3524             st->ucs4.in_byte++;
3525           break;
3526         case 3:
3527           st->ucs4.in_byte = 0;
3528           break;
3529         default:
3530           st->ucs4.in_byte++;
3531         }
3532     }
3533   return CODING_CATEGORY_UCS4_MASK;
3534 }
3535
3536 static void
3537 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3538                     unsigned_char_dynarr *dst, unsigned int n)
3539 {
3540   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3541   unsigned int flags = str->flags;
3542   unsigned int ch    = str->ch;
3543   unsigned char counter = str->counter;
3544
3545   while (n--)
3546     {
3547       unsigned char c = *src++;
3548       switch (counter)
3549         {
3550         case 0:
3551           ch = c;
3552           counter = 3;
3553           break;
3554         case 1:
3555           decode_ucs4 ( ( ch << 8 ) | c, dst);
3556           ch = 0;
3557           counter = 0;
3558           break;
3559         default:
3560           ch = ( ch << 8 ) | c;
3561           counter--;
3562         }
3563     }
3564   if (counter & CODING_STATE_END)
3565     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3566
3567   str->flags = flags;
3568   str->ch    = ch;
3569   str->counter = counter;
3570 }
3571
3572 static void
3573 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3574                     unsigned_char_dynarr *dst, unsigned int n)
3575 {
3576 #ifndef UTF2000
3577   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3578   unsigned int flags = str->flags;
3579   unsigned int ch = str->ch;
3580   unsigned char char_boundary = str->iso2022.current_char_boundary;
3581   Lisp_Object charset = str->iso2022.current_charset;
3582
3583 #ifdef ENABLE_COMPOSITE_CHARS
3584   /* flags for handling composite chars.  We do a little switcharoo
3585      on the source while we're outputting the composite char. */
3586   unsigned int saved_n = 0;
3587   CONST unsigned char *saved_src = NULL;
3588   int in_composite = 0;
3589
3590  back_to_square_n:
3591 #endif
3592
3593   while (n--)
3594     {
3595       unsigned char c = *src++;
3596
3597       if (BYTE_ASCII_P (c))
3598         {               /* Processing ASCII character */
3599           ch = 0;
3600           encode_ucs4 (Vcharset_ascii, c, 0, dst);
3601           char_boundary = 1;
3602         }
3603       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3604         { /* Processing Leading Byte */
3605           ch = 0;
3606           charset = CHARSET_BY_LEADING_BYTE (c);
3607           if (LEADING_BYTE_PREFIX_P(c))
3608             ch = c;
3609           char_boundary = 0;
3610         }
3611       else
3612         {                       /* Processing Non-ASCII character */
3613           char_boundary = 1;
3614           if (EQ (charset, Vcharset_control_1))
3615             {
3616               encode_ucs4 (Vcharset_control_1, c, 0, dst);
3617             }
3618           else
3619             {
3620               switch (XCHARSET_REP_BYTES (charset))
3621                 {
3622                 case 2:
3623                   encode_ucs4 (charset, c, 0, dst);
3624                   break;
3625                 case 3:
3626                   if (XCHARSET_PRIVATE_P (charset))
3627                     {
3628                       encode_ucs4 (charset, c, 0, dst);
3629                       ch = 0;
3630                     }
3631                   else if (ch)
3632                     {
3633 #ifdef ENABLE_COMPOSITE_CHARS
3634                       if (EQ (charset, Vcharset_composite))
3635                         {
3636                           if (in_composite)
3637                             {
3638                               /* #### Bother! We don't know how to
3639                                  handle this yet. */
3640                               Dynarr_add (dst, 0);
3641                               Dynarr_add (dst, 0);
3642                               Dynarr_add (dst, 0);
3643                               Dynarr_add (dst, '~');
3644                             }
3645                           else
3646                             {
3647                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3648                                                        ch & 0x7F, c & 0x7F);
3649                               Lisp_Object lstr = composite_char_string (emch);
3650                               saved_n = n;
3651                               saved_src = src;
3652                               in_composite = 1;
3653                               src = XSTRING_DATA   (lstr);
3654                               n   = XSTRING_LENGTH (lstr);
3655                             }
3656                         }
3657                       else
3658 #endif /* ENABLE_COMPOSITE_CHARS */
3659                         {
3660                           encode_ucs4(charset, ch, c, dst);
3661                         }
3662                       ch = 0;
3663                     }
3664                   else
3665                     {
3666                       ch = c;
3667                       char_boundary = 0;
3668                     }
3669                   break;
3670                 case 4:
3671                   if (ch)
3672                     {
3673                       encode_ucs4 (charset, ch, c, dst);
3674                       ch = 0;
3675                     }
3676                   else
3677                     {
3678                       ch = c;
3679                       char_boundary = 0;
3680                     }
3681                   break;
3682                 default:
3683                   abort ();
3684                 }
3685             }
3686         }
3687     }
3688
3689 #ifdef ENABLE_COMPOSITE_CHARS
3690   if (in_composite)
3691     {
3692       n = saved_n;
3693       src = saved_src;
3694       in_composite = 0;
3695       goto back_to_square_n; /* Wheeeeeeeee ..... */
3696     }
3697 #endif /* ENABLE_COMPOSITE_CHARS */
3698
3699   str->flags = flags;
3700   str->ch = ch;
3701   str->iso2022.current_char_boundary = char_boundary;
3702   str->iso2022.current_charset = charset;
3703
3704   /* Verbum caro factum est! */
3705 #endif
3706 }
3707
3708 \f
3709 /************************************************************************/
3710 /*                           UTF-8 methods                              */
3711 /************************************************************************/
3712
3713 static int
3714 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3715                     unsigned int n)
3716 {
3717   while (n--)
3718     {
3719       unsigned char c = *src++;
3720       switch (st->utf8.in_byte)
3721         {
3722         case 0:
3723           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3724             return 0;
3725           else if (c >= 0xfc)
3726             st->utf8.in_byte = 5;
3727           else if (c >= 0xf8)
3728             st->utf8.in_byte = 4;
3729           else if (c >= 0xf0)
3730             st->utf8.in_byte = 3;
3731           else if (c >= 0xe0)
3732             st->utf8.in_byte = 2;
3733           else if (c >= 0xc0)
3734             st->utf8.in_byte = 1;
3735           else if (c >= 0x80)
3736             return 0;
3737           break;
3738         default:
3739           if ((c & 0xc0) != 0x80)
3740             return 0;
3741           else
3742             st->utf8.in_byte--;
3743         }
3744     }
3745   return CODING_CATEGORY_UTF8_MASK;
3746 }
3747
3748 static void
3749 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3750                     unsigned_char_dynarr *dst, unsigned int n)
3751 {
3752   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3753   unsigned int flags  = str->flags;
3754   unsigned int ch     = str->ch;
3755   eol_type_t eol_type = str->eol_type;
3756   unsigned char counter = str->counter;
3757
3758   while (n--)
3759     {
3760       unsigned char c = *src++;
3761       switch (counter)
3762         {
3763         case 0:
3764           if ( c >= 0xfc )
3765             {
3766               ch = c & 0x01;
3767               counter = 5;
3768             }
3769           else if ( c >= 0xf8 )
3770             {
3771               ch = c & 0x03;
3772               counter = 4;
3773             }
3774           else if ( c >= 0xf0 )
3775             {
3776               ch = c & 0x07;
3777               counter = 3;
3778             }
3779           else if ( c >= 0xe0 )
3780             {
3781               ch = c & 0x0f;
3782               counter = 2;
3783             }
3784           else if ( c >= 0xc0 )
3785             {
3786               ch = c & 0x1f;
3787               counter = 1;
3788             }
3789           else
3790             {
3791               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3792               decode_ucs4 (c, dst);
3793             }
3794           break;
3795         case 1:
3796           ch = ( ch << 6 ) | ( c & 0x3f );
3797           decode_ucs4 (ch, dst);
3798           ch = 0;
3799           counter = 0;
3800           break;
3801         default:
3802           ch = ( ch << 6 ) | ( c & 0x3f );
3803           counter--;
3804         }
3805     label_continue_loop:;
3806     }
3807
3808   if (flags & CODING_STATE_END)
3809     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3810
3811   str->flags = flags;
3812   str->ch    = ch;
3813   str->counter = counter;
3814 }
3815
3816 #ifndef UTF2000
3817 static void
3818 encode_utf8 (Lisp_Object charset,
3819              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3820 {
3821   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3822   if ( code <= 0x7f )
3823     {
3824       Dynarr_add (dst, code);
3825     }
3826   else if ( code <= 0x7ff )
3827     {
3828       Dynarr_add (dst, (code >> 6) | 0xc0);
3829       Dynarr_add (dst, (code & 0x3f) | 0x80);
3830     }
3831   else if ( code <= 0xffff )
3832     {
3833       Dynarr_add (dst,  (code >> 12) | 0xe0);
3834       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3835       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3836     }
3837   else if ( code <= 0x1fffff )
3838     {
3839       Dynarr_add (dst,  (code >> 18) | 0xf0);
3840       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3841       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3842       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3843     }
3844   else if ( code <= 0x3ffffff )
3845     {
3846       Dynarr_add (dst,  (code >> 24) | 0xf8);
3847       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3848       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3849       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3850       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3851     }
3852   else
3853     {
3854       Dynarr_add (dst,  (code >> 30) | 0xfc);
3855       Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3856       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3857       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3858       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3859       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3860     }
3861 }
3862 #endif
3863
3864 static void
3865 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3866                     unsigned_char_dynarr *dst, unsigned int n)
3867 {
3868   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3869   unsigned int flags  = str->flags;
3870   unsigned int ch     = str->ch;
3871   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3872   unsigned char char_boundary = str->iso2022.current_char_boundary;
3873 #ifdef UTF2000
3874
3875   while (n--)
3876     {
3877       unsigned char c = *src++;
3878       switch (char_boundary)
3879         {
3880         case 0:
3881           if ( c >= 0xfc )
3882             {
3883               Dynarr_add (dst, c);
3884               char_boundary = 5;
3885             }
3886           else if ( c >= 0xf8 )
3887             {
3888               Dynarr_add (dst, c);
3889               char_boundary = 4;
3890             }
3891           else if ( c >= 0xf0 )
3892             {
3893               Dynarr_add (dst, c);
3894               char_boundary = 3;
3895             }
3896           else if ( c >= 0xe0 )
3897             {
3898               Dynarr_add (dst, c);
3899               char_boundary = 2;
3900             }
3901           else if ( c >= 0xc0 )
3902             {
3903               Dynarr_add (dst, c);
3904               char_boundary = 1;
3905             }
3906           else
3907             {
3908               if (c == '\n')
3909                 {
3910                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3911                     Dynarr_add (dst, '\r');
3912                   if (eol_type != EOL_CR)
3913                     Dynarr_add (dst, c);
3914                 }
3915               else
3916                 Dynarr_add (dst, c);
3917               char_boundary = 0;
3918             }
3919           break;
3920         case 1:
3921           Dynarr_add (dst, c);
3922           char_boundary = 0;
3923           break;
3924         default:
3925           Dynarr_add (dst, c);
3926           char_boundary--;
3927         }
3928     }
3929 #else /* not UTF2000 */
3930   Lisp_Object charset = str->iso2022.current_charset;
3931
3932 #ifdef ENABLE_COMPOSITE_CHARS
3933   /* flags for handling composite chars.  We do a little switcharoo
3934      on the source while we're outputting the composite char. */
3935   unsigned int saved_n = 0;
3936   CONST unsigned char *saved_src = NULL;
3937   int in_composite = 0;
3938
3939  back_to_square_n:
3940 #endif /* ENABLE_COMPOSITE_CHARS */
3941
3942   while (n--)
3943     {
3944       unsigned char c = *src++;
3945
3946       if (BYTE_ASCII_P (c))
3947         {               /* Processing ASCII character */
3948           ch = 0;
3949           if (c == '\n')
3950             {
3951               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3952                 Dynarr_add (dst, '\r');
3953               if (eol_type != EOL_CR)
3954                 Dynarr_add (dst, c);
3955             }
3956           else
3957             encode_utf8 (Vcharset_ascii, c, 0, dst);
3958           char_boundary = 1;
3959         }
3960       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3961         { /* Processing Leading Byte */
3962           ch = 0;
3963           charset = CHARSET_BY_LEADING_BYTE (c);
3964           if (LEADING_BYTE_PREFIX_P(c))
3965             ch = c;
3966           char_boundary = 0;
3967         }
3968       else
3969         {                       /* Processing Non-ASCII character */
3970           char_boundary = 1;
3971           if (EQ (charset, Vcharset_control_1))
3972             {
3973               encode_utf8 (Vcharset_control_1, c, 0, dst);
3974             }
3975           else
3976             {
3977               switch (XCHARSET_REP_BYTES (charset))
3978                 {
3979                 case 2:
3980                   encode_utf8 (charset, c, 0, dst);
3981                   break;
3982                 case 3:
3983                   if (XCHARSET_PRIVATE_P (charset))
3984                     {
3985                       encode_utf8 (charset, c, 0, dst);
3986                       ch = 0;
3987                     }
3988                   else if (ch)
3989                     {
3990 #ifdef ENABLE_COMPOSITE_CHARS
3991                       if (EQ (charset, Vcharset_composite))
3992                         {
3993                           if (in_composite)
3994                             {
3995                               /* #### Bother! We don't know how to
3996                                  handle this yet. */
3997                               encode_utf8 (Vcharset_ascii, '~', 0, dst);
3998                             }
3999                           else
4000                             {
4001                               Emchar emch = MAKE_CHAR (Vcharset_composite,
4002                                                        ch & 0x7F, c & 0x7F);
4003                               Lisp_Object lstr = composite_char_string (emch);
4004                               saved_n = n;
4005                               saved_src = src;
4006                               in_composite = 1;
4007                               src = XSTRING_DATA   (lstr);
4008                               n   = XSTRING_LENGTH (lstr);
4009                             }
4010                         }
4011                       else
4012 #endif /* ENABLE_COMPOSITE_CHARS */
4013                         {
4014                           encode_utf8 (charset, ch, c, dst);
4015                         }
4016                       ch = 0;
4017                     }
4018                   else
4019                     {
4020                       ch = c;
4021                       char_boundary = 0;
4022                     }
4023                   break;
4024                 case 4:
4025                   if (ch)
4026                     {
4027                       encode_utf8 (charset, ch, c, dst);
4028                       ch = 0;
4029                     }
4030                   else
4031                     {
4032                       ch = c;
4033                       char_boundary = 0;
4034                     }
4035                   break;
4036                 default:
4037                   abort ();
4038                 }
4039             }
4040         }
4041     }
4042
4043 #ifdef ENABLE_COMPOSITE_CHARS
4044   if (in_composite)
4045     {
4046       n = saved_n;
4047       src = saved_src;
4048       in_composite = 0;
4049       goto back_to_square_n; /* Wheeeeeeeee ..... */
4050     }
4051 #endif
4052
4053 #endif /* not UTF2000 */
4054   str->flags = flags;
4055   str->ch    = ch;
4056   str->iso2022.current_char_boundary = char_boundary;
4057 #ifndef UTF2000
4058   str->iso2022.current_charset = charset;
4059 #endif
4060
4061   /* Verbum caro factum est! */
4062 }
4063
4064 \f
4065 /************************************************************************/
4066 /*                           ISO2022 methods                            */
4067 /************************************************************************/
4068
4069 /* The following note describes the coding system ISO2022 briefly.
4070    Since the intention of this note is to help understand the
4071    functions in this file, some parts are NOT ACCURATE or OVERLY
4072    SIMPLIFIED.  For thorough understanding, please refer to the
4073    original document of ISO2022.
4074
4075    ISO2022 provides many mechanisms to encode several character sets
4076    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4077    is encoded using bytes less than 128.  This may make the encoded
4078    text a little bit longer, but the text passes more easily through
4079    several gateways, some of which strip off MSB (Most Signigant Bit).
4080
4081    There are two kinds of character sets: control character set and
4082    graphic character set.  The former contains control characters such
4083    as `newline' and `escape' to provide control functions (control
4084    functions are also provided by escape sequences).  The latter
4085    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4086    two control character sets and many graphic character sets.
4087
4088    Graphic character sets are classified into one of the following
4089    four classes, according to the number of bytes (DIMENSION) and
4090    number of characters in one dimension (CHARS) of the set:
4091    - DIMENSION1_CHARS94
4092    - DIMENSION1_CHARS96
4093    - DIMENSION2_CHARS94
4094    - DIMENSION2_CHARS96
4095
4096    In addition, each character set is assigned an identification tag,
4097    unique for each set, called "final character" (denoted as <F>
4098    hereafter).  The <F> of each character set is decided by ECMA(*)
4099    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4100    (0x30..0x3F are for private use only).
4101
4102    Note (*): ECMA = European Computer Manufacturers Association
4103
4104    Here are examples of graphic character set [NAME(<F>)]:
4105         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4106         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4107         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4108         o DIMENSION2_CHARS96 -- none for the moment
4109
4110    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4111         C0 [0x00..0x1F] -- control character plane 0
4112         GL [0x20..0x7F] -- graphic character plane 0
4113         C1 [0x80..0x9F] -- control character plane 1
4114         GR [0xA0..0xFF] -- graphic character plane 1
4115
4116    A control character set is directly designated and invoked to C0 or
4117    C1 by an escape sequence.  The most common case is that:
4118    - ISO646's  control character set is designated/invoked to C0, and
4119    - ISO6429's control character set is designated/invoked to C1,
4120    and usually these designations/invocations are omitted in encoded
4121    text.  In a 7-bit environment, only C0 can be used, and a control
4122    character for C1 is encoded by an appropriate escape sequence to
4123    fit into the environment.  All control characters for C1 are
4124    defined to have corresponding escape sequences.
4125
4126    A graphic character set is at first designated to one of four
4127    graphic registers (G0 through G3), then these graphic registers are
4128    invoked to GL or GR.  These designations and invocations can be
4129    done independently.  The most common case is that G0 is invoked to
4130    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4131    these invocations and designations are omitted in encoded text.
4132    In a 7-bit environment, only GL can be used.
4133
4134    When a graphic character set of CHARS94 is invoked to GL, codes
4135    0x20 and 0x7F of the GL area work as control characters SPACE and
4136    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4137    be used.
4138
4139    There are two ways of invocation: locking-shift and single-shift.
4140    With locking-shift, the invocation lasts until the next different
4141    invocation, whereas with single-shift, the invocation affects the
4142    following character only and doesn't affect the locking-shift
4143    state.  Invocations are done by the following control characters or
4144    escape sequences:
4145
4146    ----------------------------------------------------------------------
4147    abbrev  function                  cntrl escape seq   description
4148    ----------------------------------------------------------------------
4149    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4150    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4151    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4152    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4153    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4154    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4155    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4156    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4157    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4158    ----------------------------------------------------------------------
4159    (*) These are not used by any known coding system.
4160
4161    Control characters for these functions are defined by macros
4162    ISO_CODE_XXX in `coding.h'.
4163
4164    Designations are done by the following escape sequences:
4165    ----------------------------------------------------------------------
4166    escape sequence      description
4167    ----------------------------------------------------------------------
4168    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4169    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4170    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4171    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4172    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4173    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4174    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4175    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4176    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4177    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4178    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4179    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4180    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4181    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4182    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4183    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4184    ----------------------------------------------------------------------
4185
4186    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4187    of dimension 1, chars 94, and final character <F>, etc...
4188
4189    Note (*): Although these designations are not allowed in ISO2022,
4190    Emacs accepts them on decoding, and produces them on encoding
4191    CHARS96 character sets in a coding system which is characterized as
4192    7-bit environment, non-locking-shift, and non-single-shift.
4193
4194    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4195    '(' can be omitted.  We refer to this as "short-form" hereafter.
4196
4197    Now you may notice that there are a lot of ways for encoding the
4198    same multilingual text in ISO2022.  Actually, there exist many
4199    coding systems such as Compound Text (used in X11's inter client
4200    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4201    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4202    localized platforms), and all of these are variants of ISO2022.
4203
4204    In addition to the above, Emacs handles two more kinds of escape
4205    sequences: ISO6429's direction specification and Emacs' private
4206    sequence for specifying character composition.
4207
4208    ISO6429's direction specification takes the following form:
4209         o CSI ']'      -- end of the current direction
4210         o CSI '0' ']'  -- end of the current direction
4211         o CSI '1' ']'  -- start of left-to-right text
4212         o CSI '2' ']'  -- start of right-to-left text
4213    The control character CSI (0x9B: control sequence introducer) is
4214    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4215
4216    Character composition specification takes the following form:
4217         o ESC '0' -- start character composition
4218         o ESC '1' -- end character composition
4219    Since these are not standard escape sequences of any ISO standard,
4220    their use with these meanings is restricted to Emacs only.  */
4221
4222 static void
4223 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4224 {
4225   int i;
4226
4227   for (i = 0; i < 4; i++)
4228     {
4229       if (!NILP (coding_system))
4230         iso->charset[i] =
4231           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4232       else
4233         iso->charset[i] = Qt;
4234       iso->invalid_designated[i] = 0;
4235     }
4236   iso->esc = ISO_ESC_NOTHING;
4237   iso->esc_bytes_index = 0;
4238   iso->register_left = 0;
4239   iso->register_right = 1;
4240   iso->switched_dir_and_no_valid_charset_yet = 0;
4241   iso->invalid_switch_dir = 0;
4242   iso->output_direction_sequence = 0;
4243   iso->output_literally = 0;
4244 #ifdef ENABLE_COMPOSITE_CHARS
4245   if (iso->composite_chars)
4246     Dynarr_reset (iso->composite_chars);
4247 #endif
4248 }
4249
4250 static int
4251 fit_to_be_escape_quoted (unsigned char c)
4252 {
4253   switch (c)
4254     {
4255     case ISO_CODE_ESC:
4256     case ISO_CODE_CSI:
4257     case ISO_CODE_SS2:
4258     case ISO_CODE_SS3:
4259     case ISO_CODE_SO:
4260     case ISO_CODE_SI:
4261       return 1;
4262
4263     default:
4264       return 0;
4265     }
4266 }
4267
4268 /* Parse one byte of an ISO2022 escape sequence.
4269    If the result is an invalid escape sequence, return 0 and
4270    do not change anything in STR.  Otherwise, if the result is
4271    an incomplete escape sequence, update ISO2022.ESC and
4272    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4273    all the state variables (but not ISO2022.ESC_BYTES) and
4274    return 1.
4275
4276    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4277    or invocation of an invalid character set and treat that as
4278    an unrecognized escape sequence. */
4279
4280 static int
4281 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4282                    unsigned char c, unsigned int *flags,
4283                    int check_invalid_charsets)
4284 {
4285   /* (1) If we're at the end of a designation sequence, CS is the
4286      charset being designated and REG is the register to designate
4287      it to.
4288
4289      (2) If we're at the end of a locking-shift sequence, REG is
4290      the register to invoke and HALF (0 == left, 1 == right) is
4291      the half to invoke it into.
4292
4293      (3) If we're at the end of a single-shift sequence, REG is
4294      the register to invoke. */
4295   Lisp_Object cs = Qnil;
4296   int reg, half;
4297
4298   /* NOTE: This code does goto's all over the fucking place.
4299      The reason for this is that we're basically implementing
4300      a state machine here, and hierarchical languages like C
4301      don't really provide a clean way of doing this. */
4302
4303   if (! (*flags & CODING_STATE_ESCAPE))
4304     /* At beginning of escape sequence; we need to reset our
4305        escape-state variables. */
4306     iso->esc = ISO_ESC_NOTHING;
4307
4308   iso->output_literally = 0;
4309   iso->output_direction_sequence = 0;
4310
4311   switch (iso->esc)
4312     {
4313     case ISO_ESC_NOTHING:
4314       iso->esc_bytes_index = 0;
4315       switch (c)
4316         {
4317         case ISO_CODE_ESC:      /* Start escape sequence */
4318           *flags |= CODING_STATE_ESCAPE;
4319           iso->esc = ISO_ESC;
4320           goto not_done;
4321
4322         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4323           *flags |= CODING_STATE_ESCAPE;
4324           iso->esc = ISO_ESC_5_11;
4325           goto not_done;
4326
4327         case ISO_CODE_SO:       /* locking shift 1 */
4328           reg = 1; half = 0;
4329           goto locking_shift;
4330         case ISO_CODE_SI:       /* locking shift 0 */
4331           reg = 0; half = 0;
4332           goto locking_shift;
4333
4334         case ISO_CODE_SS2:      /* single shift */
4335           reg = 2;
4336           goto single_shift;
4337         case ISO_CODE_SS3:      /* single shift */
4338           reg = 3;
4339           goto single_shift;
4340
4341         default:                        /* Other control characters */
4342           return 0;
4343         }
4344
4345     case ISO_ESC:
4346       switch (c)
4347         {
4348           /**** single shift ****/
4349
4350         case 'N':       /* single shift 2 */
4351           reg = 2;
4352           goto single_shift;
4353         case 'O':       /* single shift 3 */
4354           reg = 3;
4355           goto single_shift;
4356
4357           /**** locking shift ****/
4358
4359         case '~':       /* locking shift 1 right */
4360           reg = 1; half = 1;
4361           goto locking_shift;
4362         case 'n':       /* locking shift 2 */
4363           reg = 2; half = 0;
4364           goto locking_shift;
4365         case '}':       /* locking shift 2 right */
4366           reg = 2; half = 1;
4367           goto locking_shift;
4368         case 'o':       /* locking shift 3 */
4369           reg = 3; half = 0;
4370           goto locking_shift;
4371         case '|':       /* locking shift 3 right */
4372           reg = 3; half = 1;
4373           goto locking_shift;
4374
4375 #ifdef ENABLE_COMPOSITE_CHARS
4376           /**** composite ****/
4377
4378         case '0':
4379           iso->esc = ISO_ESC_START_COMPOSITE;
4380           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4381             CODING_STATE_COMPOSITE;
4382           return 1;
4383
4384         case '1':
4385           iso->esc = ISO_ESC_END_COMPOSITE;
4386           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4387             ~CODING_STATE_COMPOSITE;
4388           return 1;
4389 #endif /* ENABLE_COMPOSITE_CHARS */
4390
4391           /**** directionality ****/
4392
4393         case '[':
4394           iso->esc = ISO_ESC_5_11;
4395           goto not_done;
4396
4397           /**** designation ****/
4398
4399         case '$':       /* multibyte charset prefix */
4400           iso->esc = ISO_ESC_2_4;
4401           goto not_done;
4402
4403         default:
4404           if (0x28 <= c && c <= 0x2F)
4405             {
4406               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4407               goto not_done;
4408             }
4409
4410           /* This function is called with CODESYS equal to nil when
4411              doing coding-system detection. */
4412           if (!NILP (codesys)
4413               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4414               && fit_to_be_escape_quoted (c))
4415             {
4416               iso->esc = ISO_ESC_LITERAL;
4417               *flags &= CODING_STATE_ISO2022_LOCK;
4418               return 1;
4419             }
4420
4421           /* bzzzt! */
4422           return 0;
4423         }
4424
4425
4426
4427       /**** directionality ****/
4428
4429     case ISO_ESC_5_11:          /* ISO6429 direction control */
4430       if (c == ']')
4431         {
4432           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4433           goto directionality;
4434         }
4435       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4436       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4437       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4438       else               return 0;
4439       goto not_done;
4440
4441     case ISO_ESC_5_11_0:
4442       if (c == ']')
4443         {
4444           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4445           goto directionality;
4446         }
4447       return 0;
4448
4449     case ISO_ESC_5_11_1:
4450       if (c == ']')
4451         {
4452           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4453           goto directionality;
4454         }
4455       return 0;
4456
4457     case ISO_ESC_5_11_2:
4458       if (c == ']')
4459         {
4460           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4461           goto directionality;
4462         }
4463       return 0;
4464
4465     directionality:
4466       iso->esc = ISO_ESC_DIRECTIONALITY;
4467       /* Various junk here to attempt to preserve the direction sequences
4468          literally in the text if they would otherwise be swallowed due
4469          to invalid designations that don't show up as actual charset
4470          changes in the text. */
4471       if (iso->invalid_switch_dir)
4472         {
4473           /* We already inserted a direction switch literally into the
4474              text.  We assume (#### this may not be right) that the
4475              next direction switch is the one going the other way,
4476              and we need to output that literally as well. */
4477           iso->output_literally = 1;
4478           iso->invalid_switch_dir = 0;
4479         }
4480       else
4481         {
4482           int jj;
4483
4484           /* If we are in the thrall of an invalid designation,
4485            then stick the directionality sequence literally into the
4486            output stream so it ends up in the original text again. */
4487           for (jj = 0; jj < 4; jj++)
4488             if (iso->invalid_designated[jj])
4489               break;
4490           if (jj < 4)
4491             {
4492               iso->output_literally = 1;
4493               iso->invalid_switch_dir = 1;
4494             }
4495           else
4496             /* Indicate that we haven't yet seen a valid designation,
4497                so that if a switch-dir is directly followed by an
4498                invalid designation, both get inserted literally. */
4499             iso->switched_dir_and_no_valid_charset_yet = 1;
4500         }
4501       return 1;
4502
4503
4504       /**** designation ****/
4505
4506     case ISO_ESC_2_4:
4507       if (0x28 <= c && c <= 0x2F)
4508         {
4509           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4510           goto not_done;
4511         }
4512       if (0x40 <= c && c <= 0x42)
4513         {
4514           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4515                                       *flags & CODING_STATE_R2L ?
4516                                       CHARSET_RIGHT_TO_LEFT :
4517                                       CHARSET_LEFT_TO_RIGHT);
4518           reg = 0;
4519           goto designated;
4520         }
4521       return 0;
4522
4523     default:
4524       {
4525         int type =-1;
4526
4527         if (c < '0' || c > '~')
4528           return 0; /* bad final byte */
4529
4530         if (iso->esc >= ISO_ESC_2_8 &&
4531             iso->esc <= ISO_ESC_2_15)
4532           {
4533             type = ((iso->esc >= ISO_ESC_2_12) ?
4534                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4535             reg = (iso->esc - ISO_ESC_2_8) & 3;
4536           }
4537         else if (iso->esc >= ISO_ESC_2_4_8 &&
4538                  iso->esc <= ISO_ESC_2_4_15)
4539           {
4540             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4541                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4542             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4543           }
4544         else
4545           {
4546             /* Can this ever be reached? -slb */
4547             abort();
4548           }
4549
4550         cs = CHARSET_BY_ATTRIBUTES (type, c,
4551                                     *flags & CODING_STATE_R2L ?
4552                                     CHARSET_RIGHT_TO_LEFT :
4553                                     CHARSET_LEFT_TO_RIGHT);
4554         goto designated;
4555       }
4556     }
4557
4558  not_done:
4559   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4560   return -1;
4561
4562  single_shift:
4563   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4564     /* can't invoke something that ain't there. */
4565     return 0;
4566   iso->esc = ISO_ESC_SINGLE_SHIFT;
4567   *flags &= CODING_STATE_ISO2022_LOCK;
4568   if (reg == 2)
4569     *flags |= CODING_STATE_SS2;
4570   else
4571     *flags |= CODING_STATE_SS3;
4572   return 1;
4573
4574  locking_shift:
4575   if (check_invalid_charsets &&
4576       !CHARSETP (iso->charset[reg]))
4577     /* can't invoke something that ain't there. */
4578     return 0;
4579   if (half)
4580     iso->register_right = reg;
4581   else
4582     iso->register_left = reg;
4583   *flags &= CODING_STATE_ISO2022_LOCK;
4584   iso->esc = ISO_ESC_LOCKING_SHIFT;
4585   return 1;
4586
4587  designated:
4588   if (NILP (cs) && check_invalid_charsets)
4589     {
4590       iso->invalid_designated[reg] = 1;
4591       iso->charset[reg] = Vcharset_ascii;
4592       iso->esc = ISO_ESC_DESIGNATE;
4593       *flags &= CODING_STATE_ISO2022_LOCK;
4594       iso->output_literally = 1;
4595       if (iso->switched_dir_and_no_valid_charset_yet)
4596         {
4597           /* We encountered a switch-direction followed by an
4598              invalid designation.  Ensure that the switch-direction
4599              gets outputted; otherwise it will probably get eaten
4600              when the text is written out again. */
4601           iso->switched_dir_and_no_valid_charset_yet = 0;
4602           iso->output_direction_sequence = 1;
4603           /* And make sure that the switch-dir going the other
4604              way gets outputted, as well. */
4605           iso->invalid_switch_dir = 1;
4606         }
4607       return 1;
4608     }
4609   /* This function is called with CODESYS equal to nil when
4610      doing coding-system detection. */
4611   if (!NILP (codesys))
4612     {
4613       charset_conversion_spec_dynarr *dyn =
4614         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4615
4616       if (dyn)
4617         {
4618           int i;
4619
4620           for (i = 0; i < Dynarr_length (dyn); i++)
4621             {
4622               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4623               if (EQ (cs, spec->from_charset))
4624                 cs = spec->to_charset;
4625             }
4626         }
4627     }
4628
4629   iso->charset[reg] = cs;
4630   iso->esc = ISO_ESC_DESIGNATE;
4631   *flags &= CODING_STATE_ISO2022_LOCK;
4632   if (iso->invalid_designated[reg])
4633     {
4634       iso->invalid_designated[reg] = 0;
4635       iso->output_literally = 1;
4636     }
4637   if (iso->switched_dir_and_no_valid_charset_yet)
4638     iso->switched_dir_and_no_valid_charset_yet = 0;
4639   return 1;
4640 }
4641
4642 static int
4643 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4644                        unsigned int n)
4645 {
4646   int mask;
4647
4648   /* #### There are serious deficiencies in the recognition mechanism
4649      here.  This needs to be much smarter if it's going to cut it.
4650      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4651      it should be detected as Latin-1.
4652      All the ISO2022 stuff in this file should be synced up with the
4653      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4654      Perhaps we should wait till R2L works in FSF Emacs? */
4655
4656   if (!st->iso2022.initted)
4657     {
4658       reset_iso2022 (Qnil, &st->iso2022.iso);
4659       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4660                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4661                           CODING_CATEGORY_ISO_8_1_MASK |
4662                           CODING_CATEGORY_ISO_8_2_MASK |
4663                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4664       st->iso2022.flags = 0;
4665       st->iso2022.high_byte_count = 0;
4666       st->iso2022.saw_single_shift = 0;
4667       st->iso2022.initted = 1;
4668     }
4669
4670   mask = st->iso2022.mask;
4671
4672   while (n--)
4673     {
4674       int c = *src++;
4675       if (c >= 0xA0)
4676         {
4677           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4678           st->iso2022.high_byte_count++;
4679         }
4680       else
4681         {
4682           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4683             {
4684               if (st->iso2022.high_byte_count & 1)
4685                 /* odd number of high bytes; assume not iso-8-2 */
4686                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4687             }
4688           st->iso2022.high_byte_count = 0;
4689           st->iso2022.saw_single_shift = 0;
4690           if (c > 0x80)
4691             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4692         }
4693       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4694           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4695         { /* control chars */
4696           switch (c)
4697             {
4698               /* Allow and ignore control characters that you might
4699                  reasonably see in a text file */
4700             case '\r':
4701             case '\n':
4702             case '\t':
4703             case  7: /* bell */
4704             case  8: /* backspace */
4705             case 11: /* vertical tab */
4706             case 12: /* form feed */
4707             case 26: /* MS-DOS C-z junk */
4708             case 31: /* '^_' -- for info */
4709               goto label_continue_loop;
4710
4711             default:
4712               break;
4713             }
4714         }
4715
4716       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4717           || BYTE_C1_P (c))
4718         {
4719           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4720                                  &st->iso2022.flags, 0))
4721             {
4722               switch (st->iso2022.iso.esc)
4723                 {
4724                 case ISO_ESC_DESIGNATE:
4725                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4726                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4727                   break;
4728                 case ISO_ESC_LOCKING_SHIFT:
4729                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4730                   goto ran_out_of_chars;
4731                 case ISO_ESC_SINGLE_SHIFT:
4732                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4733                   st->iso2022.saw_single_shift = 1;
4734                   break;
4735                 default:
4736                   break;
4737                 }
4738             }
4739           else
4740             {
4741               mask = 0;
4742               goto ran_out_of_chars;
4743             }
4744         }
4745     label_continue_loop:;
4746     }
4747
4748  ran_out_of_chars:
4749
4750   return mask;
4751 }
4752
4753 static int
4754 postprocess_iso2022_mask (int mask)
4755 {
4756   /* #### kind of cheesy */
4757   /* If seven-bit ISO is allowed, then assume that the encoding is
4758      entirely seven-bit and turn off the eight-bit ones. */
4759   if (mask & CODING_CATEGORY_ISO_7_MASK)
4760     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4761                CODING_CATEGORY_ISO_8_1_MASK |
4762                CODING_CATEGORY_ISO_8_2_MASK);
4763   return mask;
4764 }
4765
4766 /* If FLAGS is a null pointer or specifies right-to-left motion,
4767    output a switch-dir-to-left-to-right sequence to DST.
4768    Also update FLAGS if it is not a null pointer.
4769    If INTERNAL_P is set, we are outputting in internal format and
4770    need to handle the CSI differently. */
4771
4772 static void
4773 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4774                                  unsigned_char_dynarr *dst,
4775                                  unsigned int *flags,
4776                                  int internal_p)
4777 {
4778   if (!flags || (*flags & CODING_STATE_R2L))
4779     {
4780       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4781         {
4782           Dynarr_add (dst, ISO_CODE_ESC);
4783           Dynarr_add (dst, '[');
4784         }
4785       else if (internal_p)
4786         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4787       else
4788         Dynarr_add (dst, ISO_CODE_CSI);
4789       Dynarr_add (dst, '0');
4790       Dynarr_add (dst, ']');
4791       if (flags)
4792         *flags &= ~CODING_STATE_R2L;
4793     }
4794 }
4795
4796 /* If FLAGS is a null pointer or specifies a direction different from
4797    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4798    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4799    sequence to DST.  Also update FLAGS if it is not a null pointer.
4800    If INTERNAL_P is set, we are outputting in internal format and
4801    need to handle the CSI differently. */
4802
4803 static void
4804 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4805                           unsigned_char_dynarr *dst, unsigned int *flags,
4806                           int internal_p)
4807 {
4808   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4809       direction == CHARSET_LEFT_TO_RIGHT)
4810     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4811   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4812            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4813            direction == CHARSET_RIGHT_TO_LEFT)
4814     {
4815       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4816         {
4817           Dynarr_add (dst, ISO_CODE_ESC);
4818           Dynarr_add (dst, '[');
4819         }
4820       else if (internal_p)
4821         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4822       else
4823         Dynarr_add (dst, ISO_CODE_CSI);
4824       Dynarr_add (dst, '2');
4825       Dynarr_add (dst, ']');
4826       if (flags)
4827         *flags |= CODING_STATE_R2L;
4828     }
4829 }
4830
4831 /* Convert ISO2022-format data to internal format. */
4832
4833 static void
4834 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4835                        unsigned_char_dynarr *dst, unsigned int n)
4836 {
4837   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4838   unsigned int flags  = str->flags;
4839   unsigned int ch     = str->ch;
4840   eol_type_t eol_type = str->eol_type;
4841 #ifdef ENABLE_COMPOSITE_CHARS
4842   unsigned_char_dynarr *real_dst = dst;
4843 #endif
4844   Lisp_Object coding_system;
4845
4846   XSETCODING_SYSTEM (coding_system, str->codesys);
4847
4848 #ifdef ENABLE_COMPOSITE_CHARS
4849   if (flags & CODING_STATE_COMPOSITE)
4850     dst = str->iso2022.composite_chars;
4851 #endif /* ENABLE_COMPOSITE_CHARS */
4852
4853   while (n--)
4854     {
4855       unsigned char c = *src++;
4856       if (flags & CODING_STATE_ESCAPE)
4857         {       /* Within ESC sequence */
4858           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4859                                           c, &flags, 1);
4860
4861           if (retval)
4862             {
4863               switch (str->iso2022.esc)
4864                 {
4865 #ifdef ENABLE_COMPOSITE_CHARS
4866                 case ISO_ESC_START_COMPOSITE:
4867                   if (str->iso2022.composite_chars)
4868                     Dynarr_reset (str->iso2022.composite_chars);
4869                   else
4870                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4871                   dst = str->iso2022.composite_chars;
4872                   break;
4873                 case ISO_ESC_END_COMPOSITE:
4874                   {
4875                     Bufbyte comstr[MAX_EMCHAR_LEN];
4876                     Bytecount len;
4877                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4878                                                          Dynarr_length (dst));
4879                     dst = real_dst;
4880                     len = set_charptr_emchar (comstr, emch);
4881                     Dynarr_add_many (dst, comstr, len);
4882                     break;
4883                   }
4884 #endif /* ENABLE_COMPOSITE_CHARS */
4885
4886                 case ISO_ESC_LITERAL:
4887                   DECODE_ADD_BINARY_CHAR (c, dst);
4888                   break;
4889
4890                 default:
4891                   /* Everything else handled already */
4892                   break;
4893                 }
4894             }
4895
4896           /* Attempted error recovery. */
4897           if (str->iso2022.output_direction_sequence)
4898             ensure_correct_direction (flags & CODING_STATE_R2L ?
4899                                       CHARSET_RIGHT_TO_LEFT :
4900                                       CHARSET_LEFT_TO_RIGHT,
4901                                       str->codesys, dst, 0, 1);
4902           /* More error recovery. */
4903           if (!retval || str->iso2022.output_literally)
4904             {
4905               /* Output the (possibly invalid) sequence */
4906               int i;
4907               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4908                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4909               flags &= CODING_STATE_ISO2022_LOCK;
4910               if (!retval)
4911                 n++, src--;/* Repeat the loop with the same character. */
4912               else
4913                 {
4914                   /* No sense in reprocessing the final byte of the
4915                      escape sequence; it could mess things up anyway.
4916                      Just add it now. */
4917                   DECODE_ADD_BINARY_CHAR (c, dst);
4918                 }
4919             }
4920           ch = 0;
4921         }
4922       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4923         { /* Control characters */
4924
4925           /***** Error-handling *****/
4926
4927           /* If we were in the middle of a character, dump out the
4928              partial character. */
4929           DECODE_OUTPUT_PARTIAL_CHAR (ch);
4930
4931           /* If we just saw a single-shift character, dump it out.
4932              This may dump out the wrong sort of single-shift character,
4933              but least it will give an indication that something went
4934              wrong. */
4935           if (flags & CODING_STATE_SS2)
4936             {
4937               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4938               flags &= ~CODING_STATE_SS2;
4939             }
4940           if (flags & CODING_STATE_SS3)
4941             {
4942               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4943               flags &= ~CODING_STATE_SS3;
4944             }
4945
4946           /***** Now handle the control characters. *****/
4947
4948           /* Handle CR/LF */
4949           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4950
4951           flags &= CODING_STATE_ISO2022_LOCK;
4952
4953           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4954             DECODE_ADD_BINARY_CHAR (c, dst);
4955         }
4956       else
4957         {                       /* Graphic characters */
4958           Lisp_Object charset;
4959 #ifndef UTF2000
4960           Charset_ID lb;
4961 #endif
4962           int reg;
4963
4964           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4965
4966           /* Now determine the charset. */
4967           reg = ((flags & CODING_STATE_SS2) ? 2
4968                  : (flags & CODING_STATE_SS3) ? 3
4969                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4970                  : str->iso2022.register_left);
4971           charset = str->iso2022.charset[reg];
4972
4973           /* Error checking: */
4974           if (! CHARSETP (charset)
4975               || str->iso2022.invalid_designated[reg]
4976               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4977                   && XCHARSET_CHARS (charset) == 94))
4978             /* Mrmph.  We are trying to invoke a register that has no
4979                or an invalid charset in it, or trying to add a character
4980                outside the range of the charset.  Insert that char literally
4981                to preserve it for the output. */
4982             {
4983               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4984               DECODE_ADD_BINARY_CHAR (c, dst);
4985             }
4986
4987           else
4988             {
4989               /* Things are probably hunky-dorey. */
4990
4991               /* Fetch reverse charset, maybe. */
4992               if (((flags & CODING_STATE_R2L) &&
4993                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4994                   ||
4995                   (!(flags & CODING_STATE_R2L) &&
4996                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4997                 {
4998                   Lisp_Object new_charset =
4999                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5000                   if (!NILP (new_charset))
5001                     charset = new_charset;
5002                 }
5003
5004 #ifdef UTF2000
5005               if (XCHARSET_DIMENSION (charset) == 1)
5006                 {
5007                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5008                   DECODE_ADD_UCS_CHAR
5009                     (MAKE_CHAR (charset, c & 0x7F, 0), dst);
5010                 }
5011               else if (ch)
5012                 {
5013                   DECODE_ADD_UCS_CHAR
5014                     (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
5015                   ch = 0;
5016                 }
5017               else
5018                 ch = c;
5019 #else
5020               lb = XCHARSET_LEADING_BYTE (charset);
5021               switch (XCHARSET_REP_BYTES (charset))
5022                 {
5023                 case 1: /* ASCII */
5024                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5025                   Dynarr_add (dst, c & 0x7F);
5026                   break;
5027
5028                 case 2: /* one-byte official */
5029                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5030                   Dynarr_add (dst, lb);
5031                   Dynarr_add (dst, c | 0x80);
5032                   break;
5033
5034                 case 3: /* one-byte private or two-byte official */
5035                   if (XCHARSET_PRIVATE_P (charset))
5036                     {
5037                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
5038                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5039                       Dynarr_add (dst, lb);
5040                       Dynarr_add (dst, c | 0x80);
5041                     }
5042                   else
5043                     {
5044                       if (ch)
5045                         {
5046                           Dynarr_add (dst, lb);
5047                           Dynarr_add (dst, ch | 0x80);
5048                           Dynarr_add (dst, c | 0x80);
5049                           ch = 0;
5050                         }
5051                       else
5052                         ch = c;
5053                     }
5054                   break;
5055
5056                 default:        /* two-byte private */
5057                   if (ch)
5058                     {
5059                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5060                       Dynarr_add (dst, lb);
5061                       Dynarr_add (dst, ch | 0x80);
5062                       Dynarr_add (dst, c | 0x80);
5063                       ch = 0;
5064                     }
5065                   else
5066                     ch = c;
5067                 }
5068 #endif
5069             }
5070
5071           if (!ch)
5072             flags &= CODING_STATE_ISO2022_LOCK;
5073         }
5074
5075     label_continue_loop:;
5076     }
5077
5078   if (flags & CODING_STATE_END)
5079     DECODE_OUTPUT_PARTIAL_CHAR (ch);
5080
5081   str->flags = flags;
5082   str->ch    = ch;
5083 }
5084
5085
5086 /***** ISO2022 encoder *****/
5087
5088 /* Designate CHARSET into register REG. */
5089
5090 static void
5091 iso2022_designate (Lisp_Object charset, unsigned char reg,
5092                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5093 {
5094   static CONST char inter94[] = "()*+";
5095   static CONST char inter96[] = ",-./";
5096   unsigned int type;
5097   unsigned char final;
5098   Lisp_Object old_charset = str->iso2022.charset[reg];
5099
5100   str->iso2022.charset[reg] = charset;
5101   if (!CHARSETP (charset))
5102     /* charset might be an initial nil or t. */
5103     return;
5104   type = XCHARSET_TYPE (charset);
5105   final = XCHARSET_FINAL (charset);
5106   if (!str->iso2022.force_charset_on_output[reg] &&
5107       CHARSETP (old_charset) &&
5108       XCHARSET_TYPE (old_charset) == type &&
5109       XCHARSET_FINAL (old_charset) == final)
5110     return;
5111
5112   str->iso2022.force_charset_on_output[reg] = 0;
5113
5114   {
5115     charset_conversion_spec_dynarr *dyn =
5116       str->codesys->iso2022.output_conv;
5117
5118     if (dyn)
5119       {
5120         int i;
5121
5122         for (i = 0; i < Dynarr_length (dyn); i++)
5123           {
5124             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5125             if (EQ (charset, spec->from_charset))
5126                 charset = spec->to_charset;
5127           }
5128       }
5129   }
5130
5131   Dynarr_add (dst, ISO_CODE_ESC);
5132   switch (type)
5133     {
5134     case CHARSET_TYPE_94:
5135       Dynarr_add (dst, inter94[reg]);
5136       break;
5137     case CHARSET_TYPE_96:
5138       Dynarr_add (dst, inter96[reg]);
5139       break;
5140     case CHARSET_TYPE_94X94:
5141       Dynarr_add (dst, '$');
5142       if (reg != 0
5143           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5144           || final < '@'
5145           || final > 'B')
5146         Dynarr_add (dst, inter94[reg]);
5147       break;
5148     case CHARSET_TYPE_96X96:
5149       Dynarr_add (dst, '$');
5150       Dynarr_add (dst, inter96[reg]);
5151       break;
5152     }
5153   Dynarr_add (dst, final);
5154 }
5155
5156 static void
5157 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5158 {
5159   if (str->iso2022.register_left != 0)
5160     {
5161       Dynarr_add (dst, ISO_CODE_SI);
5162       str->iso2022.register_left = 0;
5163     }
5164 }
5165
5166 static void
5167 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5168 {
5169   if (str->iso2022.register_left != 1)
5170     {
5171       Dynarr_add (dst, ISO_CODE_SO);
5172       str->iso2022.register_left = 1;
5173     }
5174 }
5175
5176 /* Convert internally-formatted data to ISO2022 format. */
5177
5178 static void
5179 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
5180                        unsigned_char_dynarr *dst, unsigned int n)
5181 {
5182   unsigned char charmask, c;
5183   unsigned char char_boundary;
5184   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5185   unsigned int flags          = str->flags;
5186   Emchar ch                   = str->ch;
5187   Lisp_Coding_System *codesys = str->codesys;
5188   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5189   int i;
5190   Lisp_Object charset;
5191   int half;
5192 #ifdef UTF2000
5193   unsigned int byte1, byte2;
5194 #endif
5195
5196 #ifdef ENABLE_COMPOSITE_CHARS
5197   /* flags for handling composite chars.  We do a little switcharoo
5198      on the source while we're outputting the composite char. */
5199   unsigned int saved_n = 0;
5200   CONST unsigned char *saved_src = NULL;
5201   int in_composite = 0;
5202 #endif /* ENABLE_COMPOSITE_CHARS */
5203
5204   char_boundary = str->iso2022.current_char_boundary;
5205   charset = str->iso2022.current_charset;
5206   half = str->iso2022.current_half;
5207
5208 #ifdef ENABLE_COMPOSITE_CHARS
5209  back_to_square_n:
5210 #endif
5211 #ifdef UTF2000
5212   while (n--)
5213     {
5214       c = *src++;
5215
5216       switch (char_boundary)
5217         {
5218         case 0:
5219           if ( c >= 0xfc )
5220             {
5221               ch = c & 0x01;
5222               char_boundary = 5;
5223             }
5224           else if ( c >= 0xf8 )
5225             {
5226               ch = c & 0x03;
5227               char_boundary = 4;
5228             }
5229           else if ( c >= 0xf0 )
5230             {
5231               ch = c & 0x07;
5232               char_boundary = 3;
5233             }
5234           else if ( c >= 0xe0 )
5235             {
5236               ch = c & 0x0f;
5237               char_boundary = 2;
5238             }
5239           else if ( c >= 0xc0 )
5240             {
5241               ch = c & 0x1f;
5242               char_boundary = 1;
5243             }
5244           else
5245             {
5246               ch = 0;
5247
5248               restore_left_to_right_direction (codesys, dst, &flags, 0);
5249
5250               /* Make sure G0 contains ASCII */
5251               if ((c > ' ' && c < ISO_CODE_DEL) ||
5252                   !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5253                 {
5254                   ensure_normal_shift (str, dst);
5255                   iso2022_designate (Vcharset_ascii, 0, str, dst);
5256                 }
5257
5258               /* If necessary, restore everything to the default state
5259                  at end-of-line */
5260               if (c == '\n' &&
5261                   !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5262                 {
5263                   restore_left_to_right_direction (codesys, dst, &flags, 0);
5264
5265                   ensure_normal_shift (str, dst);
5266
5267                   for (i = 0; i < 4; i++)
5268                     {
5269                       Lisp_Object initial_charset =
5270                         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5271                       iso2022_designate (initial_charset, i, str, dst);
5272                     }
5273                 }
5274               if (c == '\n')
5275                 {
5276                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5277                     Dynarr_add (dst, '\r');
5278                   if (eol_type != EOL_CR)
5279                     Dynarr_add (dst, c);
5280                 }
5281               else
5282                 {
5283                   if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5284                       && fit_to_be_escape_quoted (c))
5285                     Dynarr_add (dst, ISO_CODE_ESC);
5286                   Dynarr_add (dst, c);
5287                 }
5288               char_boundary = 0;
5289             }
5290           break;
5291         case 1:
5292           ch = ( ch << 6 ) | ( c & 0x3f );
5293
5294           char_boundary = 0;
5295           if ( (0x80 <= ch) && (ch <= 0x9f) )
5296             {
5297               charmask = (half == 0 ? 0x00 : 0x80);
5298
5299               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5300                   && fit_to_be_escape_quoted (ch))
5301                 Dynarr_add (dst, ISO_CODE_ESC);
5302               /* you asked for it ... */
5303               Dynarr_add (dst, ch);
5304             }
5305           else
5306             {
5307               int reg;
5308
5309               BREAKUP_CHAR (ch, charset, byte1, byte2);
5310               ensure_correct_direction (XCHARSET_DIRECTION (charset),
5311                                         codesys, dst, &flags, 0);
5312
5313               /* Now determine which register to use. */
5314               reg = -1;
5315               for (i = 0; i < 4; i++)
5316                 {
5317                   if (EQ (charset, str->iso2022.charset[i]) ||
5318                       EQ (charset,
5319                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5320                     {
5321                       reg = i;
5322                       break;
5323                     }
5324                 }
5325
5326               if (reg == -1)
5327                 {
5328                   if (XCHARSET_GRAPHIC (charset) != 0)
5329                     {
5330                       if (!NILP (str->iso2022.charset[1]) &&
5331                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5332                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5333                         reg = 1;
5334                       else if (!NILP (str->iso2022.charset[2]))
5335                         reg = 2;
5336                       else if (!NILP (str->iso2022.charset[3]))
5337                         reg = 3;
5338                       else
5339                         reg = 0;
5340                     }
5341                   else
5342                     reg = 0;
5343                 }
5344
5345               iso2022_designate (charset, reg, str, dst);
5346
5347               /* Now invoke that register. */
5348               switch (reg)
5349                 {
5350                 case 0:
5351                   ensure_normal_shift (str, dst);
5352                   half = 0;
5353                   break;
5354
5355                 case 1:
5356                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5357                     {
5358                       ensure_shift_out (str, dst);
5359                       half = 0;
5360                     }
5361                   else
5362                     half = 1;
5363                   break;
5364
5365                 case 2:
5366                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5367                     {
5368                       Dynarr_add (dst, ISO_CODE_ESC);
5369                       Dynarr_add (dst, 'N');
5370                       half = 0;
5371                     }
5372                   else
5373                     {
5374                       Dynarr_add (dst, ISO_CODE_SS2);
5375                       half = 1;
5376                     }
5377                   break;
5378
5379                 case 3:
5380                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5381                     {
5382                       Dynarr_add (dst, ISO_CODE_ESC);
5383                       Dynarr_add (dst, 'O');
5384                       half = 0;
5385                     }
5386                   else
5387                     {
5388                       Dynarr_add (dst, ISO_CODE_SS3);
5389                       half = 1;
5390                     }
5391                   break;
5392
5393                 default:
5394                   abort ();
5395                 }
5396
5397               charmask = (half == 0 ? 0x00 : 0x80);
5398
5399               switch (XCHARSET_DIMENSION (charset))
5400                 {
5401                 case 1:
5402                   Dynarr_add (dst, byte1 | charmask);
5403                   break;
5404                 case 2:
5405                   Dynarr_add (dst, byte1 | charmask);
5406                   Dynarr_add (dst, byte2 | charmask);
5407                   break;
5408                 default:
5409                   abort ();
5410                 }
5411             }
5412           ch =0;
5413           break;
5414         default:
5415           ch = ( ch << 6 ) | ( c & 0x3f );
5416           char_boundary--;
5417         }
5418     }
5419 #else /* not UTF2000 */
5420
5421   while (n--)
5422     {
5423       c = *src++;
5424
5425       if (BYTE_ASCII_P (c))
5426         {               /* Processing ASCII character */
5427           ch = 0;
5428
5429           restore_left_to_right_direction (codesys, dst, &flags, 0);
5430
5431           /* Make sure G0 contains ASCII */
5432           if ((c > ' ' && c < ISO_CODE_DEL) ||
5433               !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5434             {
5435               ensure_normal_shift (str, dst);
5436               iso2022_designate (Vcharset_ascii, 0, str, dst);
5437             }
5438
5439           /* If necessary, restore everything to the default state
5440              at end-of-line */
5441           if (c == '\n' &&
5442               !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5443             {
5444               restore_left_to_right_direction (codesys, dst, &flags, 0);
5445
5446               ensure_normal_shift (str, dst);
5447
5448               for (i = 0; i < 4; i++)
5449                 {
5450                   Lisp_Object initial_charset =
5451                     CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5452                   iso2022_designate (initial_charset, i, str, dst);
5453                 }
5454             }
5455           if (c == '\n')
5456             {
5457               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5458                 Dynarr_add (dst, '\r');
5459               if (eol_type != EOL_CR)
5460                 Dynarr_add (dst, c);
5461             }
5462           else
5463             {
5464               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5465                   && fit_to_be_escape_quoted (c))
5466                 Dynarr_add (dst, ISO_CODE_ESC);
5467               Dynarr_add (dst, c);
5468             }
5469           char_boundary = 1;
5470         }
5471
5472       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5473         { /* Processing Leading Byte */
5474           ch = 0;
5475           charset = CHARSET_BY_LEADING_BYTE (c);
5476           if (LEADING_BYTE_PREFIX_P(c))
5477             ch = c;
5478           else if (!EQ (charset, Vcharset_control_1)
5479 #ifdef ENABLE_COMPOSITE_CHARS
5480                    && !EQ (charset, Vcharset_composite)
5481 #endif
5482                    )
5483             {
5484               int reg;
5485
5486               ensure_correct_direction (XCHARSET_DIRECTION (charset),
5487                                         codesys, dst, &flags, 0);
5488
5489               /* Now determine which register to use. */
5490               reg = -1;
5491               for (i = 0; i < 4; i++)
5492                 {
5493                   if (EQ (charset, str->iso2022.charset[i]) ||
5494                       EQ (charset,
5495                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5496                     {
5497                       reg = i;
5498                       break;
5499                     }
5500                 }
5501
5502               if (reg == -1)
5503                 {
5504                   if (XCHARSET_GRAPHIC (charset) != 0)
5505                     {
5506                       if (!NILP (str->iso2022.charset[1]) &&
5507                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5508                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5509                         reg = 1;
5510                       else if (!NILP (str->iso2022.charset[2]))
5511                         reg = 2;
5512                       else if (!NILP (str->iso2022.charset[3]))
5513                         reg = 3;
5514                       else
5515                         reg = 0;
5516                     }
5517                   else
5518                     reg = 0;
5519                 }
5520
5521               iso2022_designate (charset, reg, str, dst);
5522
5523               /* Now invoke that register. */
5524               switch (reg)
5525                 {
5526                 case 0:
5527                   ensure_normal_shift (str, dst);
5528                   half = 0;
5529                   break;
5530
5531                 case 1:
5532                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5533                     {
5534                       ensure_shift_out (str, dst);
5535                       half = 0;
5536                     }
5537                   else
5538                     half = 1;
5539                   break;
5540
5541                 case 2:
5542                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5543                     {
5544                       Dynarr_add (dst, ISO_CODE_ESC);
5545                       Dynarr_add (dst, 'N');
5546                       half = 0;
5547                     }
5548                   else
5549                     {
5550                       Dynarr_add (dst, ISO_CODE_SS2);
5551                       half = 1;
5552                     }
5553                   break;
5554
5555                 case 3:
5556                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5557                     {
5558                       Dynarr_add (dst, ISO_CODE_ESC);
5559                       Dynarr_add (dst, 'O');
5560                       half = 0;
5561                     }
5562                   else
5563                     {
5564                       Dynarr_add (dst, ISO_CODE_SS3);
5565                       half = 1;
5566                     }
5567                   break;
5568
5569                 default:
5570                   abort ();
5571                 }
5572             }
5573           char_boundary = 0;
5574         }
5575       else
5576         {                       /* Processing Non-ASCII character */
5577           charmask = (half == 0 ? 0x7F : 0xFF);
5578           char_boundary = 1;
5579           if (EQ (charset, Vcharset_control_1))
5580             {
5581               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5582                   && fit_to_be_escape_quoted (c))
5583                 Dynarr_add (dst, ISO_CODE_ESC);
5584               /* you asked for it ... */
5585               Dynarr_add (dst, c - 0x20);
5586             }
5587           else
5588             {
5589               switch (XCHARSET_REP_BYTES (charset))
5590                 {
5591                 case 2:
5592                   Dynarr_add (dst, c & charmask);
5593                   break;
5594                 case 3:
5595                   if (XCHARSET_PRIVATE_P (charset))
5596                     {
5597                       Dynarr_add (dst, c & charmask);
5598                       ch = 0;
5599                     }
5600                   else if (ch)
5601                     {
5602 #ifdef ENABLE_COMPOSITE_CHARS
5603                       if (EQ (charset, Vcharset_composite))
5604                         {
5605                           if (in_composite)
5606                             {
5607                               /* #### Bother! We don't know how to
5608                                  handle this yet. */
5609                               Dynarr_add (dst, '~');
5610                             }
5611                           else
5612                             {
5613                               Emchar emch = MAKE_CHAR (Vcharset_composite,
5614                                                        ch & 0x7F, c & 0x7F);
5615                               Lisp_Object lstr = composite_char_string (emch);
5616                               saved_n = n;
5617                               saved_src = src;
5618                               in_composite = 1;
5619                               src = XSTRING_DATA   (lstr);
5620                               n   = XSTRING_LENGTH (lstr);
5621                               Dynarr_add (dst, ISO_CODE_ESC);
5622                               Dynarr_add (dst, '0'); /* start composing */
5623                             }
5624                         }
5625                       else
5626 #endif /* ENABLE_COMPOSITE_CHARS */
5627                         {
5628                           Dynarr_add (dst, ch & charmask);
5629                           Dynarr_add (dst, c & charmask);
5630                         }
5631                       ch = 0;
5632                     }
5633                   else
5634                     {
5635                       ch = c;
5636                       char_boundary = 0;
5637                     }
5638                   break;
5639                 case 4:
5640                   if (ch)
5641                     {
5642                       Dynarr_add (dst, ch & charmask);
5643                       Dynarr_add (dst, c & charmask);
5644                       ch = 0;
5645                     }
5646                   else
5647                     {
5648                       ch = c;
5649                       char_boundary = 0;
5650                     }
5651                   break;
5652                 default:
5653                   abort ();
5654                 }
5655             }
5656         }
5657     }
5658 #endif /* not UTF2000 */
5659
5660 #ifdef ENABLE_COMPOSITE_CHARS
5661   if (in_composite)
5662     {
5663       n = saved_n;
5664       src = saved_src;
5665       in_composite = 0;
5666       Dynarr_add (dst, ISO_CODE_ESC);
5667       Dynarr_add (dst, '1'); /* end composing */
5668       goto back_to_square_n; /* Wheeeeeeeee ..... */
5669     }
5670 #endif /* ENABLE_COMPOSITE_CHARS */
5671
5672 #ifdef UTF2000
5673   if ( (char_boundary == 0) && flags & CODING_STATE_END)
5674 #else
5675   if (char_boundary && flags & CODING_STATE_END)
5676 #endif
5677     {
5678       restore_left_to_right_direction (codesys, dst, &flags, 0);
5679       ensure_normal_shift (str, dst);
5680       for (i = 0; i < 4; i++)
5681         {
5682           Lisp_Object initial_charset =
5683             CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5684           iso2022_designate (initial_charset, i, str, dst);
5685         }
5686     }
5687
5688   str->flags = flags;
5689   str->ch    = ch;
5690   str->iso2022.current_char_boundary = char_boundary;
5691   str->iso2022.current_charset = charset;
5692   str->iso2022.current_half = half;
5693
5694   /* Verbum caro factum est! */
5695 }
5696 #endif /* MULE */
5697 \f
5698 /************************************************************************/
5699 /*                     No-conversion methods                            */
5700 /************************************************************************/
5701
5702 /* This is used when reading in "binary" files -- i.e. files that may
5703    contain all 256 possible byte values and that are not to be
5704    interpreted as being in any particular decoding. */
5705 static void
5706 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5707                              unsigned_char_dynarr *dst, unsigned int n)
5708 {
5709   unsigned char c;
5710   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5711   unsigned int flags  = str->flags;
5712   unsigned int ch     = str->ch;
5713   eol_type_t eol_type = str->eol_type;
5714
5715   while (n--)
5716     {
5717       c = *src++;
5718
5719       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5720       DECODE_ADD_BINARY_CHAR (c, dst);
5721     label_continue_loop:;
5722     }
5723
5724   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5725
5726   str->flags = flags;
5727   str->ch    = ch;
5728 }
5729
5730 static void
5731 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5732                              unsigned_char_dynarr *dst, unsigned int n)
5733 {
5734   unsigned char c;
5735   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5736   unsigned int flags  = str->flags;
5737   unsigned int ch     = str->ch;
5738   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5739 #ifdef UTF2000
5740   unsigned char char_boundary = str->iso2022.current_char_boundary;
5741 #endif
5742
5743   while (n--)
5744     {
5745       c = *src++;
5746 #ifdef UTF2000
5747       switch (char_boundary)
5748         {
5749         case 0:
5750           if ( c >= 0xfc )
5751             {
5752               ch = c & 0x01;
5753               char_boundary = 5;
5754             }
5755           else if ( c >= 0xf8 )
5756             {
5757               ch = c & 0x03;
5758               char_boundary = 4;
5759             }
5760           else if ( c >= 0xf0 )
5761             {
5762               ch = c & 0x07;
5763               char_boundary = 3;
5764             }
5765           else if ( c >= 0xe0 )
5766             {
5767               ch = c & 0x0f;
5768               char_boundary = 2;
5769             }
5770           else if ( c >= 0xc0 )
5771             {
5772               ch = c & 0x1f;
5773               char_boundary = 1;
5774             }
5775           else
5776             {
5777               ch = 0;
5778
5779               if (c == '\n')
5780                 {
5781                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5782                     Dynarr_add (dst, '\r');
5783                   if (eol_type != EOL_CR)
5784                     Dynarr_add (dst, c);
5785                 }
5786               else
5787                 Dynarr_add (dst, c);
5788               char_boundary = 0;
5789             }
5790           break;
5791         case 1:
5792           ch = ( ch << 6 ) | ( c & 0x3f );
5793           Dynarr_add (dst, ch & 0xff);
5794           char_boundary = 0;
5795           break;
5796         default:
5797           ch = ( ch << 6 ) | ( c & 0x3f );
5798           char_boundary--;
5799         }
5800 #else /* not UTF2000 */
5801       if (c == '\n')
5802         {
5803           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5804             Dynarr_add (dst, '\r');
5805           if (eol_type != EOL_CR)
5806             Dynarr_add (dst, '\n');
5807           ch = 0;
5808         }
5809       else if (BYTE_ASCII_P (c))
5810         {
5811           assert (ch == 0);
5812           Dynarr_add (dst, c);
5813         }
5814       else if (BUFBYTE_LEADING_BYTE_P (c))
5815         {
5816           assert (ch == 0);
5817           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5818               c == LEADING_BYTE_CONTROL_1)
5819             ch = c;
5820           else
5821             Dynarr_add (dst, '~'); /* untranslatable character */
5822         }
5823       else
5824         {
5825           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5826             Dynarr_add (dst, c);
5827           else if (ch == LEADING_BYTE_CONTROL_1)
5828             {
5829               assert (c < 0xC0);
5830               Dynarr_add (dst, c - 0x20);
5831             }
5832           /* else it should be the second or third byte of an
5833              untranslatable character, so ignore it */
5834           ch = 0;
5835         }
5836 #endif /* not UTF2000 */
5837     }
5838
5839   str->flags = flags;
5840   str->ch    = ch;
5841 #ifdef UTF2000
5842   str->iso2022.current_char_boundary = char_boundary;
5843 #endif
5844 }
5845
5846 \f
5847 /************************************************************************/
5848 /*                   Simple internal/external functions                 */
5849 /************************************************************************/
5850
5851 static Extbyte_dynarr *conversion_out_dynarr;
5852 static Bufbyte_dynarr *conversion_in_dynarr;
5853
5854 /* Determine coding system from coding format */
5855
5856 /* #### not correct for all values of `fmt'! */
5857 static Lisp_Object
5858 external_data_format_to_coding_system (enum external_data_format fmt)
5859 {
5860   switch (fmt)
5861     {
5862     case FORMAT_FILENAME:
5863     case FORMAT_TERMINAL:
5864       if (EQ (Vfile_name_coding_system, Qnil) ||
5865           EQ (Vfile_name_coding_system, Qbinary))
5866         return Qnil;
5867       else
5868         return Fget_coding_system (Vfile_name_coding_system);
5869 #ifdef MULE
5870     case FORMAT_CTEXT:
5871       return Fget_coding_system (Qctext);
5872 #endif
5873     default:
5874       return Qnil;
5875     }
5876 }
5877
5878 Extbyte *
5879 convert_to_external_format (CONST Bufbyte *ptr,
5880                             Bytecount len,
5881                             Extcount *len_out,
5882                             enum external_data_format fmt)
5883 {
5884   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5885
5886   if (!conversion_out_dynarr)
5887     conversion_out_dynarr = Dynarr_new (Extbyte);
5888   else
5889     Dynarr_reset (conversion_out_dynarr);
5890
5891   if (NILP (coding_system))
5892     {
5893       CONST Bufbyte *end = ptr + len;
5894
5895       for (; ptr < end;)
5896         {
5897 #ifdef UTF2000
5898           Bufbyte c =
5899             (*ptr < 0xc0) ? *ptr :
5900             ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5901 #else
5902           Bufbyte c =
5903             (BYTE_ASCII_P (*ptr))                  ? *ptr :
5904             (*ptr == LEADING_BYTE_CONTROL_1)       ? (*(ptr+1) - 0x20) :
5905             (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5906             '~';
5907 #endif
5908           Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5909           INC_CHARPTR (ptr);
5910         }
5911
5912 #ifdef ERROR_CHECK_BUFPOS
5913       assert (ptr == end);
5914 #endif
5915     }
5916   else
5917     {
5918       Lisp_Object instream, outstream, da_outstream;
5919       Lstream *istr, *ostr;
5920       struct gcpro gcpro1, gcpro2, gcpro3;
5921       char tempbuf[1024]; /* some random amount */
5922
5923       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5924       da_outstream = make_dynarr_output_stream
5925         ((unsigned_char_dynarr *) conversion_out_dynarr);
5926       outstream =
5927         make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5928       istr = XLSTREAM (instream);
5929       ostr = XLSTREAM (outstream);
5930       GCPRO3 (instream, outstream, da_outstream);
5931       while (1)
5932         {
5933           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5934           if (!size_in_bytes)
5935             break;
5936           Lstream_write (ostr, tempbuf, size_in_bytes);
5937         }
5938       Lstream_close (istr);
5939       Lstream_close (ostr);
5940       UNGCPRO;
5941       Lstream_delete (istr);
5942       Lstream_delete (ostr);
5943       Lstream_delete (XLSTREAM (da_outstream));
5944     }
5945
5946   *len_out = Dynarr_length (conversion_out_dynarr);
5947   Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5948   return Dynarr_atp (conversion_out_dynarr, 0);
5949 }
5950
5951 Bufbyte *
5952 convert_from_external_format (CONST Extbyte *ptr,
5953                               Extcount len,
5954                               Bytecount *len_out,
5955                               enum external_data_format fmt)
5956 {
5957   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5958
5959   if (!conversion_in_dynarr)
5960     conversion_in_dynarr = Dynarr_new (Bufbyte);
5961   else
5962     Dynarr_reset (conversion_in_dynarr);
5963
5964   if (NILP (coding_system))
5965     {
5966       CONST Extbyte *end = ptr + len;
5967       for (; ptr < end; ptr++)
5968         {
5969           Extbyte c = *ptr;
5970           DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5971         }
5972     }
5973   else
5974     {
5975       Lisp_Object instream, outstream, da_outstream;
5976       Lstream *istr, *ostr;
5977       struct gcpro gcpro1, gcpro2, gcpro3;
5978       char tempbuf[1024]; /* some random amount */
5979
5980       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5981       da_outstream = make_dynarr_output_stream
5982         ((unsigned_char_dynarr *) conversion_in_dynarr);
5983       outstream =
5984         make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5985       istr = XLSTREAM (instream);
5986       ostr = XLSTREAM (outstream);
5987       GCPRO3 (instream, outstream, da_outstream);
5988       while (1)
5989         {
5990           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5991           if (!size_in_bytes)
5992             break;
5993           Lstream_write (ostr, tempbuf, size_in_bytes);
5994         }
5995       Lstream_close (istr);
5996       Lstream_close (ostr);
5997       UNGCPRO;
5998       Lstream_delete (istr);
5999       Lstream_delete (ostr);
6000       Lstream_delete (XLSTREAM (da_outstream));
6001     }
6002
6003   *len_out = Dynarr_length (conversion_in_dynarr);
6004   Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
6005   return Dynarr_atp (conversion_in_dynarr, 0);
6006 }
6007
6008 \f
6009 /************************************************************************/
6010 /*                             Initialization                           */
6011 /************************************************************************/
6012
6013 void
6014 syms_of_file_coding (void)
6015 {
6016   defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
6017   deferror (&Qcoding_system_error, "coding-system-error",
6018             "Coding-system error", Qio_error);
6019
6020   DEFSUBR (Fcoding_system_p);
6021   DEFSUBR (Ffind_coding_system);
6022   DEFSUBR (Fget_coding_system);
6023   DEFSUBR (Fcoding_system_list);
6024   DEFSUBR (Fcoding_system_name);
6025   DEFSUBR (Fmake_coding_system);
6026   DEFSUBR (Fcopy_coding_system);
6027   DEFSUBR (Fdefine_coding_system_alias);
6028   DEFSUBR (Fsubsidiary_coding_system);
6029
6030   DEFSUBR (Fcoding_system_type);
6031   DEFSUBR (Fcoding_system_doc_string);
6032 #ifdef MULE
6033   DEFSUBR (Fcoding_system_charset);
6034 #endif
6035   DEFSUBR (Fcoding_system_property);
6036
6037   DEFSUBR (Fcoding_category_list);
6038   DEFSUBR (Fset_coding_priority_list);
6039   DEFSUBR (Fcoding_priority_list);
6040   DEFSUBR (Fset_coding_category_system);
6041   DEFSUBR (Fcoding_category_system);
6042
6043   DEFSUBR (Fdetect_coding_region);
6044   DEFSUBR (Fdecode_coding_region);
6045   DEFSUBR (Fencode_coding_region);
6046 #ifdef MULE
6047   DEFSUBR (Fdecode_shift_jis_char);
6048   DEFSUBR (Fencode_shift_jis_char);
6049   DEFSUBR (Fdecode_big5_char);
6050   DEFSUBR (Fencode_big5_char);
6051   DEFSUBR (Fset_ucs_char);
6052   DEFSUBR (Fucs_char);
6053   DEFSUBR (Fset_char_ucs);
6054   DEFSUBR (Fchar_ucs);
6055 #endif /* MULE */
6056   defsymbol (&Qcoding_system_p, "coding-system-p");
6057   defsymbol (&Qno_conversion, "no-conversion");
6058   defsymbol (&Qraw_text, "raw-text");
6059 #ifdef MULE
6060   defsymbol (&Qbig5, "big5");
6061   defsymbol (&Qshift_jis, "shift-jis");
6062   defsymbol (&Qucs4, "ucs-4");
6063   defsymbol (&Qutf8, "utf-8");
6064   defsymbol (&Qccl, "ccl");
6065   defsymbol (&Qiso2022, "iso2022");
6066 #endif /* MULE */
6067   defsymbol (&Qmnemonic, "mnemonic");
6068   defsymbol (&Qeol_type, "eol-type");
6069   defsymbol (&Qpost_read_conversion, "post-read-conversion");
6070   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6071
6072   defsymbol (&Qcr, "cr");
6073   defsymbol (&Qlf, "lf");
6074   defsymbol (&Qcrlf, "crlf");
6075   defsymbol (&Qeol_cr, "eol-cr");
6076   defsymbol (&Qeol_lf, "eol-lf");
6077   defsymbol (&Qeol_crlf, "eol-crlf");
6078 #ifdef MULE
6079   defsymbol (&Qcharset_g0, "charset-g0");
6080   defsymbol (&Qcharset_g1, "charset-g1");
6081   defsymbol (&Qcharset_g2, "charset-g2");
6082   defsymbol (&Qcharset_g3, "charset-g3");
6083   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6084   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6085   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6086   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6087   defsymbol (&Qno_iso6429, "no-iso6429");
6088   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6089   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6090
6091   defsymbol (&Qshort, "short");
6092   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6093   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6094   defsymbol (&Qseven, "seven");
6095   defsymbol (&Qlock_shift, "lock-shift");
6096   defsymbol (&Qescape_quoted, "escape-quoted");
6097 #endif /* MULE */
6098   defsymbol (&Qencode, "encode");
6099   defsymbol (&Qdecode, "decode");
6100
6101 #ifdef MULE
6102   defsymbol (&Qctext, "ctext");
6103   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6104              "shift-jis");
6105   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6106              "big5");
6107   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6108              "ucs-4");
6109   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6110              "utf-8");
6111   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6112              "iso-7");
6113   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6114              "iso-8-designate");
6115   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6116              "iso-8-1");
6117   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6118              "iso-8-2");
6119   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6120              "iso-lock-shift");
6121 #endif /* MULE */
6122   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6123              "no-conversion");
6124 }
6125
6126 void
6127 lstream_type_create_file_coding (void)
6128 {
6129   LSTREAM_HAS_METHOD (decoding, reader);
6130   LSTREAM_HAS_METHOD (decoding, writer);
6131   LSTREAM_HAS_METHOD (decoding, rewinder);
6132   LSTREAM_HAS_METHOD (decoding, seekable_p);
6133   LSTREAM_HAS_METHOD (decoding, flusher);
6134   LSTREAM_HAS_METHOD (decoding, closer);
6135   LSTREAM_HAS_METHOD (decoding, marker);
6136
6137   LSTREAM_HAS_METHOD (encoding, reader);
6138   LSTREAM_HAS_METHOD (encoding, writer);
6139   LSTREAM_HAS_METHOD (encoding, rewinder);
6140   LSTREAM_HAS_METHOD (encoding, seekable_p);
6141   LSTREAM_HAS_METHOD (encoding, flusher);
6142   LSTREAM_HAS_METHOD (encoding, closer);
6143   LSTREAM_HAS_METHOD (encoding, marker);
6144 }
6145
6146 void
6147 vars_of_file_coding (void)
6148 {
6149   int i;
6150
6151   /* Initialize to something reasonable ... */
6152   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
6153     {
6154       coding_category_system[i] = Qnil;
6155       coding_category_by_priority[i] = i;
6156     }
6157
6158   Fprovide (intern ("file-coding"));
6159
6160   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6161 Coding system used for TTY keyboard input.
6162 Not used under a windowing system.
6163 */ );
6164   Vkeyboard_coding_system = Qnil;
6165
6166   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6167 Coding system used for TTY display output.
6168 Not used under a windowing system.
6169 */ );
6170   Vterminal_coding_system = Qnil;
6171
6172   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6173 Overriding coding system used when writing a file or process.
6174 You should *bind* this, not set it.  If this is non-nil, it specifies
6175 the coding system that will be used when a file or process is read
6176 in, and overrides `buffer-file-coding-system-for-read',
6177 `insert-file-contents-pre-hook', etc.  Use those variables instead of
6178 this one for permanent changes to the environment.
6179 */ );
6180   Vcoding_system_for_read = Qnil;
6181
6182   DEFVAR_LISP ("coding-system-for-write",
6183                &Vcoding_system_for_write /*
6184 Overriding coding system used when writing a file or process.
6185 You should *bind* this, not set it.  If this is non-nil, it specifies
6186 the coding system that will be used when a file or process is wrote
6187 in, and overrides `buffer-file-coding-system',
6188 `write-region-pre-hook', etc.  Use those variables instead of this one
6189 for permanent changes to the environment.
6190 */ );
6191   Vcoding_system_for_write = Qnil;
6192
6193   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6194 Coding system used to convert pathnames when accessing files.
6195 */ );
6196   Vfile_name_coding_system = Qnil;
6197
6198   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6199 Non-nil means the buffer contents are regarded as multi-byte form
6200 of characters, not a binary code.  This affects the display, file I/O,
6201 and behaviors of various editing commands.
6202
6203 Setting this to nil does not do anything.
6204 */ );
6205   enable_multibyte_characters = 1;
6206 }
6207
6208 void
6209 complex_vars_of_file_coding (void)
6210 {
6211   staticpro (&Vcoding_system_hash_table);
6212   Vcoding_system_hash_table =
6213     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6214
6215   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6216
6217 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
6218 {                                               \
6219   struct codesys_prop csp;                      \
6220   csp.sym = (Sym);                              \
6221   csp.prop_type = (Prop_Type);                  \
6222   Dynarr_add (the_codesys_prop_dynarr, csp);    \
6223 } while (0)
6224
6225   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
6226   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
6227   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
6228   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
6229   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
6230   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
6231   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
6232 #ifdef MULE
6233   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6234   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6235   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6236   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6237   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6238   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6239   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6240   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6241   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6242   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6243   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6244   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6245   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6246   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6247   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6248   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6249   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6250
6251   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
6252   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
6253 #endif /* MULE */
6254   /* Need to create this here or we're really screwed. */
6255   Fmake_coding_system
6256     (Qraw_text, Qno_conversion,
6257      build_string ("Raw text, which means it converts only line-break-codes."),
6258      list2 (Qmnemonic, build_string ("Raw")));
6259
6260   Fmake_coding_system
6261     (Qbinary, Qno_conversion,
6262      build_string ("Binary, which means it does not convert anything."),
6263      list4 (Qeol_type, Qlf,
6264             Qmnemonic, build_string ("Binary")));
6265
6266 #ifdef UTF2000
6267   Fmake_coding_system
6268     (Qutf8, Qutf8,
6269      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6270      list2 (Qmnemonic, build_string ("UTF8")));
6271 #endif
6272
6273   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6274
6275   /* Need this for bootstrapping */
6276   coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6277     Fget_coding_system (Qraw_text);
6278
6279 #ifdef UTF2000
6280   coding_category_system[CODING_CATEGORY_UTF8]
6281    = Fget_coding_system (Qutf8);
6282 #endif
6283
6284 #ifdef MULE
6285   {
6286     unsigned int i;
6287
6288     for (i = 0; i < 65536; i++)
6289       ucs_to_mule_table[i] = Qnil;
6290   }
6291   staticpro (&mule_to_ucs_table);
6292   mule_to_ucs_table = Fmake_char_table(Qgeneric);
6293 #endif /* MULE */
6294 }