X-Git-Url: http://git.chise.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Ffile-coding.c;h=fe42ebdf0b4869793638f3c05bed93f9491908fd;hb=ac7d0619aad74b1d57c4748ebb3ab29d9c32e3d8;hp=9fce353df16f0d9573641b20baf5a4721d029f9c;hpb=59eec5f21669e81977b5b1fe9bf717cab49cf7fb;p=chise%2Fxemacs-chise.git diff --git a/src/file-coding.c b/src/file-coding.c index 9fce353..fe42ebd 100644 --- a/src/file-coding.c +++ b/src/file-coding.c @@ -46,17 +46,17 @@ Lisp_Object Vcoding_system_for_write; Lisp_Object Vfile_name_coding_system; /* Table of symbols identifying each coding category. */ -Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1]; +Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST]; struct file_coding_dump { /* Coding system currently associated with each coding category. */ - Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1]; + Lisp_Object coding_category_system[CODING_CATEGORY_LAST]; /* Table of all coding categories in decreasing order of priority. This describes a permutation of the possible coding categories. */ - int coding_category_by_priority[CODING_CATEGORY_LAST + 1]; + int coding_category_by_priority[CODING_CATEGORY_LAST]; #ifdef MULE Lisp_Object ucs_to_mule_table[65536]; @@ -64,7 +64,7 @@ struct file_coding_dump { } *fcd; static const struct lrecord_description fcd_description_1[] = { - { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 }, + { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST }, #ifdef MULE { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) }, #endif @@ -176,68 +176,47 @@ EXFUN (Fcopy_coding_system, 2); #ifdef MULE struct detection_state; static int detect_coding_sjis (struct detection_state *st, - const unsigned char *src, - unsigned int n); -static void decode_coding_sjis (Lstream *decoding, - const unsigned char *src, - unsigned_char_dynarr *dst, - unsigned int n); -static void encode_coding_sjis (Lstream *encoding, - const unsigned char *src, - unsigned_char_dynarr *dst, - unsigned int n); + const Extbyte *src, Lstream_data_count n); +static void decode_coding_sjis (Lstream *decoding, const Extbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n); +static void encode_coding_sjis (Lstream *encoding, const Bufbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n); static int detect_coding_big5 (struct detection_state *st, - const unsigned char *src, - unsigned int n); -static void decode_coding_big5 (Lstream *decoding, - const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n); -static void encode_coding_big5 (Lstream *encoding, - const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n); + const Extbyte *src, Lstream_data_count n); +static void decode_coding_big5 (Lstream *decoding, const Extbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n); +static void encode_coding_big5 (Lstream *encoding, const Bufbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n); static int detect_coding_ucs4 (struct detection_state *st, - const unsigned char *src, - unsigned int n); -static void decode_coding_ucs4 (Lstream *decoding, - const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n); -static void encode_coding_ucs4 (Lstream *encoding, - const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n); + const Extbyte *src, Lstream_data_count n); +static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n); +static void encode_coding_ucs4 (Lstream *encoding, const Bufbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n); static int detect_coding_utf8 (struct detection_state *st, - const unsigned char *src, - unsigned int n); -static void decode_coding_utf8 (Lstream *decoding, - const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n); -static void encode_coding_utf8 (Lstream *encoding, - const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n); + const Extbyte *src, Lstream_data_count n); +static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n); +static void encode_coding_utf8 (Lstream *encoding, const Bufbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n); static int postprocess_iso2022_mask (int mask); static void reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso); static int detect_coding_iso2022 (struct detection_state *st, - const unsigned char *src, - unsigned int n); -static void decode_coding_iso2022 (Lstream *decoding, - const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n); -static void encode_coding_iso2022 (Lstream *encoding, - const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n); + const Extbyte *src, Lstream_data_count n); +static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n); +static void encode_coding_iso2022 (Lstream *encoding, const Bufbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n); #endif /* MULE */ -static void decode_coding_no_conversion (Lstream *decoding, - const unsigned char *src, - unsigned_char_dynarr *dst, - unsigned int n); -static void encode_coding_no_conversion (Lstream *encoding, - const unsigned char *src, - unsigned_char_dynarr *dst, - unsigned int n); -static void mule_decode (Lstream *decoding, const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n); -static void mule_encode (Lstream *encoding, const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n); +static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n); +static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n); +static void mule_decode (Lstream *decoding, const Extbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n); +static void mule_encode (Lstream *encoding, const Bufbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n); typedef struct codesys_prop codesys_prop; struct codesys_prop @@ -789,12 +768,12 @@ character set. Recognized properties are: 'post-read-conversion Function called after a file has been read in, to perform the - decoding. Called with two arguments, BEG and END, denoting + decoding. Called with two arguments, START and END, denoting a region of the current buffer to be decoded. 'pre-write-conversion Function called before a file is written out, to perform the - encoding. Called with two arguments, BEG and END, denoting + encoding. Called with two arguments, START and END, denoting a region of the current buffer to be encoded. @@ -983,18 +962,42 @@ if TYPE is 'ccl: } else if (EQ (type, Qccl)) { + Lisp_Object sym; + struct ccl_program test_ccl; + Extbyte *suffix; + + /* Check key first. */ if (EQ (key, Qdecode)) - { - CHECK_VECTOR (value); - CODING_SYSTEM_CCL_DECODE (codesys) = value; - } + suffix = "-ccl-decode"; else if (EQ (key, Qencode)) + suffix = "-ccl-encode"; + else + signal_simple_error ("Unrecognized property", key); + + /* If value is vector, register it as a ccl program + associated with an newly created symbol for + backward compatibility. */ + if (VECTORP (value)) { - CHECK_VECTOR (value); - CODING_SYSTEM_CCL_ENCODE (codesys) = value; + sym = Fintern (concat2 (Fsymbol_name (name), + build_string (suffix)), + Qnil); + Fregister_ccl_program (sym, value); } else - signal_simple_error ("Unrecognized property", key); + { + CHECK_SYMBOL (value); + sym = value; + } + /* check if the given ccl programs are valid. */ + if (setup_ccl_program (&test_ccl, sym) < 0) + signal_simple_error ("Invalid CCL program", value); + + if (EQ (key, Qdecode)) + CODING_SYSTEM_CCL_DECODE (codesys) = sym; + else if (EQ (key, Qencode)) + CODING_SYSTEM_CCL_ENCODE (codesys) = sym; + } #endif /* MULE */ else @@ -1430,7 +1433,7 @@ decode_coding_category (Lisp_Object symbol) int i; CHECK_SYMBOL (symbol); - for (i = 0; i <= CODING_CATEGORY_LAST; i++) + for (i = 0; i < CODING_CATEGORY_LAST; i++) if (EQ (coding_category_symbol[i], symbol)) return i; @@ -1446,7 +1449,7 @@ Return a list of all recognized coding categories. int i; Lisp_Object list = Qnil; - for (i = CODING_CATEGORY_LAST; i >= 0; i--) + for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) list = Fcons (coding_category_symbol[i], list); return list; } @@ -1460,13 +1463,13 @@ previously. */ (list)) { - int category_to_priority[CODING_CATEGORY_LAST + 1]; + int category_to_priority[CODING_CATEGORY_LAST]; int i, j; Lisp_Object rest; /* First generate a list that maps coding categories to priorities. */ - for (i = 0; i <= CODING_CATEGORY_LAST; i++) + for (i = 0; i < CODING_CATEGORY_LAST; i++) category_to_priority[i] = -1; /* Highest priority comes from the specified list. */ @@ -1483,7 +1486,7 @@ previously. /* Now go through the existing categories by priority to retrieve the categories not yet specified and preserve their priority order. */ - for (j = 0; j <= CODING_CATEGORY_LAST; j++) + for (j = 0; j < CODING_CATEGORY_LAST; j++) { int cat = fcd->coding_category_by_priority[j]; if (category_to_priority[cat] < 0) @@ -1493,7 +1496,7 @@ previously. /* Now we need to construct the inverse of the mapping we just constructed. */ - for (i = 0; i <= CODING_CATEGORY_LAST; i++) + for (i = 0; i < CODING_CATEGORY_LAST; i++) fcd->coding_category_by_priority[category_to_priority[i]] = i; /* Phew! That was confusing. */ @@ -1508,7 +1511,7 @@ Return a list of coding categories in descending order of priority. int i; Lisp_Object list = Qnil; - for (i = CODING_CATEGORY_LAST; i >= 0; i--) + for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]], list); return list; @@ -1628,14 +1631,12 @@ mask_has_at_most_one_bit_p (int mask) } static eol_type_t -detect_eol_type (struct detection_state *st, const unsigned char *src, - unsigned int n) +detect_eol_type (struct detection_state *st, const Extbyte *src, + Lstream_data_count n) { - int c; - while (n--) { - c = *src++; + unsigned char c = *(unsigned char *)src++; if (c == '\n') { if (st->eol.just_saw_cr) @@ -1674,10 +1675,8 @@ detect_eol_type (struct detection_state *st, const unsigned char *src, static int detect_coding_type (struct detection_state *st, const Extbyte *src, - unsigned int n, int just_do_eol) + Lstream_data_count n, int just_do_eol) { - int c; - if (st->eol_type == EOL_AUTODETECT) st->eol_type = detect_eol_type (st, src, n); @@ -1688,7 +1687,7 @@ detect_coding_type (struct detection_state *st, const Extbyte *src, { for (; n; n--, src++) { - c = *src; + unsigned char c = *(unsigned char *) src; if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80) { st->seen_non_ascii = 1; @@ -1762,7 +1761,7 @@ coding_system_from_mask (int mask) #endif /* Look through the coding categories by priority and find the first one that is allowed. */ - for (i = 0; i <= CODING_CATEGORY_LAST; i++) + for (i = 0; i < CODING_CATEGORY_LAST; i++) { cat = fcd->coding_category_by_priority[i]; if ((mask & (1 << cat)) && @@ -1786,6 +1785,8 @@ coding_system_from_mask (int mask) that should be unnecessary with the explicit eol-type argument. */ #define LENGTH(string_constant) (sizeof (string_constant) - 1) +/* number of leading lines to check for a coding cookie */ +#define LINES_TO_CHECK 2 void determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out, @@ -1807,15 +1808,15 @@ determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out, Extbyte buf[4096]; Lisp_Object coding_system = Qnil; Extbyte *p; - ssize_t nread = Lstream_read (stream, buf, sizeof (buf)); + Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf)); Extbyte *scan_end; + int lines_checked = 0; /* Look for initial "-*-"; mode line prefix */ for (p = buf, scan_end = buf + nread - LENGTH ("-*-coding:?-*-"); p <= scan_end - && *p != '\n' - && *p != '\r'; + && lines_checked < LINES_TO_CHECK; p++) if (*p == '-' && *(p+1) == '*' && *(p+2) == '-') { @@ -1824,8 +1825,7 @@ determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out, for (p = local_vars_beg, scan_end = buf + nread - LENGTH ("-*-"); p <= scan_end - && *p != '\n' - && *p != '\r'; + && lines_checked < LINES_TO_CHECK; p++) if (*p == '-' && *(p+1) == '*' && *(p+2) == '-') { @@ -1867,8 +1867,24 @@ determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out, } break; } + /* #### file must use standard EOLs or we miss 2d line */ + /* #### not to mention this is broken for UTF-16 DOS files */ + else if (*p == '\n' || *p == '\r') + { + lines_checked++; + /* skip past multibyte (DOS) newline */ + if (*p == '\r' && *(p+1) == '\n') p++; + } break; } + /* #### file must use standard EOLs or we miss 2d line */ + /* #### not to mention this is broken for UTF-16 DOS files */ + else if (*p == '\n' || *p == '\r') + { + lines_checked++; + /* skip past multibyte (DOS) newline */ + if (*p == '\r' && *(p+1) == '\n') p++; + } if (NILP (coding_system)) do @@ -1914,8 +1930,8 @@ determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out, DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /* Detect coding system of the text in the region between START and END. -Returned a list of possible coding systems ordered by priority. -If only ASCII characters are found, it returns 'undecided or one of +Return a list of possible coding systems ordered by priority. +If only ASCII characters are found, return 'undecided or one of its subsidiary coding systems according to a detected end-of-line type. Optional arg BUFFER defaults to the current buffer. */ @@ -1940,8 +1956,8 @@ type. Optional arg BUFFER defaults to the current buffer. decst.mask = ~0; while (1) { - unsigned char random_buffer[4096]; - ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer)); + Extbyte random_buffer[4096]; + Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer)); if (!nread) break; @@ -1960,7 +1976,7 @@ type. Optional arg BUFFER defaults to the current buffer. #ifdef MULE decst.mask = postprocess_iso2022_mask (decst.mask); #endif - for (i = CODING_CATEGORY_LAST; i >= 0; i--) + for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) { int sys = fcd->coding_category_by_priority[i]; if (decst.mask & (1 << sys)) @@ -2108,10 +2124,10 @@ struct decoding_stream struct detection_state decst; }; -static ssize_t decoding_reader (Lstream *stream, - unsigned char *data, size_t size); -static ssize_t decoding_writer (Lstream *stream, - const unsigned char *data, size_t size); +static Lstream_data_count decoding_reader (Lstream *stream, + unsigned char *data, Lstream_data_count size); +static Lstream_data_count decoding_writer (Lstream *stream, + const unsigned char *data, Lstream_data_count size); static int decoding_rewinder (Lstream *stream); static int decoding_seekable_p (Lstream *stream); static int decoding_flusher (Lstream *stream); @@ -2143,12 +2159,12 @@ decoding_marker (Lisp_Object stream) /* Read SIZE bytes of data and store it into DATA. We are a decoding stream so we read data from the other end, decode it, and store it into DATA. */ -static ssize_t -decoding_reader (Lstream *stream, unsigned char *data, size_t size) +static Lstream_data_count +decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size) { struct decoding_stream *str = DECODING_STREAM_DATA (stream); unsigned char *orig_data = data; - ssize_t read_size; + Lstream_data_count read_size; int error_occurred = 0; /* We need to interface to mule_decode(), which expects to take some @@ -2164,7 +2180,7 @@ decoding_reader (Lstream *stream, unsigned char *data, size_t size) most SIZE bytes, and delete the data from the runoff. */ if (Dynarr_length (str->runoff) > 0) { - size_t chunk = min (size, (size_t) Dynarr_length (str->runoff)); + Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff)); memcpy (data, Dynarr_atp (str->runoff, 0), chunk); Dynarr_delete_many (str->runoff, 0, chunk); data += chunk; @@ -2196,7 +2212,7 @@ decoding_reader (Lstream *stream, unsigned char *data, size_t size) /* There might be some more end data produced in the translation. See the comment above. */ str->flags |= CODING_STATE_END; - mule_decode (stream, data, str->runoff, read_size); + mule_decode (stream, (Extbyte *) data, str->runoff, read_size); } if (data - orig_data == 0) @@ -2205,16 +2221,16 @@ decoding_reader (Lstream *stream, unsigned char *data, size_t size) return data - orig_data; } -static ssize_t -decoding_writer (Lstream *stream, const unsigned char *data, size_t size) +static Lstream_data_count +decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size) { struct decoding_stream *str = DECODING_STREAM_DATA (stream); - ssize_t retval; + Lstream_data_count retval; /* Decode all our data into the runoff, and then attempt to write it all out to the other end. Remove whatever chunk we succeeded in writing. */ - mule_decode (stream, data, str->runoff, size); + mule_decode (stream, (Extbyte *) data, str->runoff, size); retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0), Dynarr_length (str->runoff)); if (retval > 0) @@ -2366,8 +2382,8 @@ make_decoding_output_stream (Lstream *stream, Lisp_Object codesys) be used for both reading and writing. */ static void -mule_decode (Lstream *decoding, const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) +mule_decode (Lstream *decoding, const Extbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n) { struct decoding_stream *str = DECODING_STREAM_DATA (decoding); @@ -2431,7 +2447,10 @@ mule_decode (Lstream *decoding, const unsigned char *src, break; case CODESYS_CCL: str->ccl.last_block = str->flags & CODING_STATE_END; - ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING); + /* When applying ccl program to stream, MUST NOT set NULL + pointer to src. */ + ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""), + dst, n, 0, CCL_MODE_DECODING); break; case CODESYS_ISO2022: decode_coding_iso2022 (decoding, src, dst, n); @@ -2486,7 +2505,7 @@ BUFFER defaults to the current buffer if unspecified. char tempbuf[1024]; /* some random amount */ Bufpos newpos, even_newer_pos; Bufpos oldpos = lisp_buffer_stream_startpos (istr); - ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); + Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); if (!size_in_bytes) break; @@ -2571,9 +2590,9 @@ struct encoding_stream #endif /* MULE */ }; -static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size); -static ssize_t encoding_writer (Lstream *stream, const unsigned char *data, - size_t size); +static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size); +static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data, + Lstream_data_count size); static int encoding_rewinder (Lstream *stream); static int encoding_seekable_p (Lstream *stream); static int encoding_flusher (Lstream *stream); @@ -2605,12 +2624,12 @@ encoding_marker (Lisp_Object stream) /* Read SIZE bytes of data and store it into DATA. We are a encoding stream so we read data from the other end, encode it, and store it into DATA. */ -static ssize_t -encoding_reader (Lstream *stream, unsigned char *data, size_t size) +static Lstream_data_count +encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size) { struct encoding_stream *str = ENCODING_STREAM_DATA (stream); unsigned char *orig_data = data; - ssize_t read_size; + Lstream_data_count read_size; int error_occurred = 0; /* We need to interface to mule_encode(), which expects to take some @@ -2667,11 +2686,11 @@ encoding_reader (Lstream *stream, unsigned char *data, size_t size) return data - orig_data; } -static ssize_t -encoding_writer (Lstream *stream, const unsigned char *data, size_t size) +static Lstream_data_count +encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size) { struct encoding_stream *str = ENCODING_STREAM_DATA (stream); - ssize_t retval; + Lstream_data_count retval; /* Encode all our data into the runoff, and then attempt to write it all out to the other end. Remove whatever chunk we succeeded @@ -2810,8 +2829,8 @@ make_encoding_output_stream (Lstream *stream, Lisp_Object codesys) Store the encoded data into DST. */ static void -mule_encode (Lstream *encoding, const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) +mule_encode (Lstream *encoding, const Bufbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n) { struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); @@ -2843,7 +2862,10 @@ mule_encode (Lstream *encoding, const unsigned char *src, break; case CODESYS_CCL: str->ccl.last_block = str->flags & CODING_STATE_END; - ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING); + /* When applying ccl program to stream, MUST NOT set NULL + pointer to src. */ + ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""), + dst, n, 0, CCL_MODE_ENCODING); break; case CODESYS_ISO2022: encode_coding_iso2022 (encoding, src, dst, n); @@ -2894,7 +2916,7 @@ text. BUFFER defaults to the current buffer if unspecified. char tempbuf[1024]; /* some random amount */ Bufpos newpos, even_newer_pos; Bufpos oldpos = lisp_buffer_stream_startpos (istr); - ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); + Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); if (!size_in_bytes) break; @@ -2957,14 +2979,11 @@ text. BUFFER defaults to the current buffer if unspecified. ((c) >= 0xA1 && (c) <= 0xDF) static int -detect_coding_sjis (struct detection_state *st, const unsigned char *src, - unsigned int n) +detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n) { - int c; - while (n--) { - c = *src++; + unsigned char c = *(unsigned char *)src++; if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) return 0; if (st->shift_jis.in_second_byte) @@ -2982,10 +3001,9 @@ detect_coding_sjis (struct detection_state *st, const unsigned char *src, /* Convert Shift-JIS data to internal format. */ static void -decode_coding_sjis (Lstream *decoding, const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) +decode_coding_sjis (Lstream *decoding, const Extbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n) { - unsigned char c; struct decoding_stream *str = DECODING_STREAM_DATA (decoding); unsigned int flags = str->flags; unsigned int ch = str->ch; @@ -2993,7 +3011,7 @@ decode_coding_sjis (Lstream *decoding, const unsigned char *src, while (n--) { - c = *src++; + unsigned char c = *(unsigned char *)src++; if (ch) { @@ -3039,10 +3057,9 @@ decode_coding_sjis (Lstream *decoding, const unsigned char *src, /* Convert internally-formatted data to Shift-JIS. */ static void -encode_coding_sjis (Lstream *encoding, const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) +encode_coding_sjis (Lstream *encoding, const Bufbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n) { - unsigned char c; struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); unsigned int flags = str->flags; unsigned int ch = str->ch; @@ -3050,7 +3067,7 @@ encode_coding_sjis (Lstream *encoding, const unsigned char *src, while (n--) { - c = *src++; + Bufbyte c = *src++; if (c == '\n') { if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) @@ -3119,16 +3136,16 @@ Return the corresponding character. } DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /* -Encode a JISX0208 character CHAR to SHIFT-JIS coding-system. +Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system. Return the corresponding character code in SHIFT-JIS as a cons of two bytes. */ - (ch)) + (character)) { Lisp_Object charset; int c1, c2, s1, s2; - CHECK_CHAR_COERCE_INT (ch); - BREAKUP_CHAR (XCHAR (ch), charset, c1, c2); + CHECK_CHAR_COERCE_INT (character); + BREAKUP_CHAR (XCHAR (character), charset, c1, c2); if (EQ (charset, Vcharset_japanese_jisx0208)) { ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2); @@ -3243,14 +3260,11 @@ Return the corresponding character code in SHIFT-JIS as a cons of two bytes. } while (0) static int -detect_coding_big5 (struct detection_state *st, const unsigned char *src, - unsigned int n) +detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n) { - int c; - while (n--) { - c = *src++; + unsigned char c = *(unsigned char *)src++; if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO || (c >= 0x80 && c <= 0xA0)) return 0; @@ -3269,10 +3283,9 @@ detect_coding_big5 (struct detection_state *st, const unsigned char *src, /* Convert Big5 data to internal format. */ static void -decode_coding_big5 (Lstream *decoding, const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) +decode_coding_big5 (Lstream *decoding, const Extbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n) { - unsigned char c; struct decoding_stream *str = DECODING_STREAM_DATA (decoding); unsigned int flags = str->flags; unsigned int ch = str->ch; @@ -3280,7 +3293,7 @@ decode_coding_big5 (Lstream *decoding, const unsigned char *src, while (n--) { - c = *src++; + unsigned char c = *(unsigned char *)src++; if (ch) { /* Previous character was first byte of Big5 char. */ @@ -3319,8 +3332,8 @@ decode_coding_big5 (Lstream *decoding, const unsigned char *src, /* Convert internally-formatted data to Big5. */ static void -encode_coding_big5 (Lstream *encoding, const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) +encode_coding_big5 (Lstream *encoding, const Bufbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n) { unsigned char c; struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); @@ -3407,16 +3420,16 @@ Return the corresponding character. } DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /* -Encode the Big5 character CH to BIG5 coding-system. +Encode the Big5 character CHARACTER in the BIG5 coding-system. Return the corresponding character code in Big5. */ - (ch)) + (character)) { Lisp_Object charset; int c1, c2, b1, b2; - CHECK_CHAR_COERCE_INT (ch); - BREAKUP_CHAR (XCHAR (ch), charset, c1, c2); + CHECK_CHAR_COERCE_INT (character); + BREAKUP_CHAR (XCHAR (character), charset, c1, c2); if (EQ (charset, Vcharset_chinese_big5_1) || EQ (charset, Vcharset_chinese_big5_2)) { @@ -3586,12 +3599,11 @@ encode_ucs4 (Lisp_Object charset, } static int -detect_coding_ucs4 (struct detection_state *st, const unsigned char *src, - unsigned int n) +detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n) { while (n--) { - int c = *src++; + unsigned char c = *(unsigned char *)src++; switch (st->ucs4.in_byte) { case 0: @@ -3611,8 +3623,8 @@ detect_coding_ucs4 (struct detection_state *st, const unsigned char *src, } static void -decode_coding_ucs4 (Lstream *decoding, const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) +decode_coding_ucs4 (Lstream *decoding, const Extbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n) { struct decoding_stream *str = DECODING_STREAM_DATA (decoding); unsigned int flags = str->flags; @@ -3621,7 +3633,7 @@ decode_coding_ucs4 (Lstream *decoding, const unsigned char *src, while (n--) { - unsigned char c = *src++; + unsigned char c = *(unsigned char *)src++; switch (counter) { case 0: @@ -3647,8 +3659,8 @@ decode_coding_ucs4 (Lstream *decoding, const unsigned char *src, } static void -encode_coding_ucs4 (Lstream *encoding, const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) +encode_coding_ucs4 (Lstream *encoding, const Bufbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n) { struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); unsigned int flags = str->flags; @@ -3713,9 +3725,9 @@ encode_coding_ucs4 (Lstream *encoding, const unsigned char *src, { /* #### Bother! We don't know how to handle this yet. */ - Dynarr_add (dst, 0); - Dynarr_add (dst, 0); - Dynarr_add (dst, 0); + Dynarr_add (dst, '\0'); + Dynarr_add (dst, '\0'); + Dynarr_add (dst, '\0'); Dynarr_add (dst, '~'); } else @@ -3786,12 +3798,11 @@ encode_coding_ucs4 (Lstream *encoding, const unsigned char *src, /************************************************************************/ static int -detect_coding_utf8 (struct detection_state *st, const unsigned char *src, - unsigned int n) +detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n) { while (n--) { - unsigned char c = *src++; + unsigned char c = *(unsigned char *)src++; switch (st->utf8.in_byte) { case 0: @@ -3821,8 +3832,8 @@ detect_coding_utf8 (struct detection_state *st, const unsigned char *src, } static void -decode_coding_utf8 (Lstream *decoding, const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) +decode_coding_utf8 (Lstream *decoding, const Extbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n) { struct decoding_stream *str = DECODING_STREAM_DATA (decoding); unsigned int flags = str->flags; @@ -3832,7 +3843,7 @@ decode_coding_utf8 (Lstream *decoding, const unsigned char *src, while (n--) { - unsigned char c = *src++; + unsigned char c = *(unsigned char *)src++; switch (counter) { case 0: @@ -3935,8 +3946,8 @@ encode_utf8 (Lisp_Object charset, } static void -encode_coding_utf8 (Lstream *encoding, const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) +encode_coding_utf8 (Lstream *encoding, const Bufbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n) { struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); unsigned int flags = str->flags; @@ -4288,7 +4299,48 @@ fit_to_be_escape_quoted (unsigned char c) If CHECK_INVALID_CHARSETS is non-zero, check for designation or invocation of an invalid character set and treat that as - an unrecognized escape sequence. */ + an unrecognized escape sequence. + + ******************************************************************** + + #### Strategies for error annotation and coding orthogonalization + + We really want to separate out a number of things. Conceptually, + there is a nested syntax. + + At the top level is the ISO 2022 extension syntax, including charset + designation and invocation, and certain auxiliary controls such as the + ISO 6429 direction specification. These are octet-oriented, with the + single exception (AFAIK) of the "exit Unicode" sequence which uses the + UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and + UTF-16, and 4 bytes for UCS-4 and UTF-32). This will be treated as a + (deprecated) special case in Unicode processing. + + The middle layer is ISO 2022 character interpretation. This will depend + on the current state of the ISO 2022 registers, and assembles octets + into the character's internal representation. + + The lowest level is translating system control conventions. At present + this is restricted to newline translation, but one could imagine doing + tab conversion or line wrapping here. "Escape from Unicode" processing + would be done at this level. + + At each level the parser will verify the syntax. In the case of a + syntax error or warning (such as a redundant escape sequence that affects + no characters), the parser will take some action, typically inserting the + erroneous octets directly into the output and creating an annotation + which can be used by higher level I/O to mark the affected region. + + This should make it possible to do something sensible about separating + newline convention processing from character construction, and about + preventing ISO 2022 escape sequences from being recognized + inappropriately. + + The basic strategy will be to have octet classification tables, and + switch processing according to the table entry. + + It's possible that, by doing the processing with tables of functions or + the like, the parser can be used for both detection and translation. */ static int parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso, @@ -4654,8 +4706,7 @@ parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso, } static int -detect_coding_iso2022 (struct detection_state *st, const unsigned char *src, - unsigned int n) +detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n) { int mask; @@ -4685,7 +4736,7 @@ detect_coding_iso2022 (struct detection_state *st, const unsigned char *src, while (n--) { - int c = *src++; + unsigned char c = *(unsigned char *)src++; if (c >= 0xA0) { mask &= ~CODING_CATEGORY_ISO_7_MASK; @@ -4845,8 +4896,8 @@ ensure_correct_direction (int direction, Lisp_Coding_System *codesys, /* Convert ISO2022-format data to internal format. */ static void -decode_coding_iso2022 (Lstream *decoding, const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) +decode_coding_iso2022 (Lstream *decoding, const Extbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n) { struct decoding_stream *str = DECODING_STREAM_DATA (decoding); unsigned int flags = str->flags; @@ -4866,7 +4917,7 @@ decode_coding_iso2022 (Lstream *decoding, const unsigned char *src, while (n--) { - unsigned char c = *src++; + unsigned char c = *(unsigned char *)src++; if (flags & CODING_STATE_ESCAPE) { /* Within ESC sequence */ int retval = parse_iso2022_esc (coding_system, &str->iso2022, @@ -5171,8 +5222,8 @@ ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst) /* Convert internally-formatted data to ISO2022 format. */ static void -encode_coding_iso2022 (Lstream *encoding, const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) +encode_coding_iso2022 (Lstream *encoding, const Bufbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n) { unsigned char charmask, c; unsigned char char_boundary; @@ -5480,10 +5531,9 @@ encode_coding_iso2022 (Lstream *encoding, const unsigned char *src, contain all 256 possible byte values and that are not to be interpreted as being in any particular decoding. */ static void -decode_coding_no_conversion (Lstream *decoding, const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) +decode_coding_no_conversion (Lstream *decoding, const Extbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n) { - unsigned char c; struct decoding_stream *str = DECODING_STREAM_DATA (decoding); unsigned int flags = str->flags; unsigned int ch = str->ch; @@ -5491,7 +5541,7 @@ decode_coding_no_conversion (Lstream *decoding, const unsigned char *src, while (n--) { - c = *src++; + unsigned char c = *(unsigned char *)src++; DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); DECODE_ADD_BINARY_CHAR (c, dst); @@ -5505,8 +5555,8 @@ decode_coding_no_conversion (Lstream *decoding, const unsigned char *src, } static void -encode_coding_no_conversion (Lstream *encoding, const unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) +encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src, + unsigned_char_dynarr *dst, Lstream_data_count n) { unsigned char c; struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); @@ -5705,10 +5755,10 @@ vars_of_file_coding (void) int i; fcd = xnew (struct file_coding_dump); - dumpstruct (&fcd, &fcd_description); + dump_add_root_struct_ptr (&fcd, &fcd_description); /* Initialize to something reasonable ... */ - for (i = 0; i <= CODING_CATEGORY_LAST; i++) + for (i = 0; i < CODING_CATEGORY_LAST; i++) { fcd->coding_category_system[i] = Qnil; fcd->coding_category_by_priority[i] = i; @@ -5772,7 +5822,7 @@ complex_vars_of_file_coding (void) make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ); the_codesys_prop_dynarr = Dynarr_new (codesys_prop); - dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description); + dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description); #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \ { \