From: handa Date: Mon, 21 Jun 2004 00:44:10 +0000 (+0000) Subject: (INC_POSITION): Use CHAR_UNITS_BY_HEAD_UTF16. X-Git-Tag: withdl~59 X-Git-Url: http://git.chise.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e7a0725f2ae1b9ce41758f1daac601e41d31cabc;p=m17n%2Fm17n-lib.git (INC_POSITION): Use CHAR_UNITS_BY_HEAD_UTF16. (compare): Pay attention to format other than utf-8. (copy): Delete this function. (count_by_utf_8, count_by_utf_16, insert): New functions. (count_utf_16_chars): Fix handling of a surrogate pair. (find_char_forward, find_char_backward): Likewise. (mtext__from_data): Delete unnecessary check. Fix number of allocated bytes. (mtext_from_data): Don't count items. (mtext_ref_char): Optimize the code. (mtext_set_char): Pay attention to format other than utf-8. (mtext_cat_char): Likewise. (mtext_dup): Don't call copy, instead do allocation here. (mtext_cat): Call insert instead of copy. (mtext_ncat): Likewise. (mtext_cpy): Delete character at first and call insert instead of copy. (mtext_ncpy): Likewise. (mtext_copy): Likewise. (mtext_duplicate): Call insert instead of copy. (mtext_del): Pay attention to format other than utf-8. (mtext_ins): Simply call insert. (mtext_ins_char): Pay attention to format other than utf-8. (mtext_tok): Call insert instead of copy. (mtext_text): Call UNIT_BYTES. --- diff --git a/src/mtext.c b/src/mtext.c index 0601439..5163323 100644 --- a/src/mtext.c +++ b/src/mtext.c @@ -104,7 +104,7 @@ static enum MTextFormat default_utf_16 = MTEXT_FORMAT_UTF_16LE; static enum MTextFormat default_utf_32 = MTEXT_FORMAT_UTF_32LE; #endif -/** Increment character position CHAR_POS and byte position UNIT_POS +/** Increment character position CHAR_POS and unit position UNIT_POS so that they point to the next character in M-text MT. No range check for CHAR_POS and UNIT_POS. */ @@ -123,7 +123,7 @@ static enum MTextFormat default_utf_32 = MTEXT_FORMAT_UTF_32LE; \ if ((mt)->format != default_utf_16) \ c = SWAP_16 (c); \ - (unit_pos) += (c < 0xD800 || c >= 0xE000) ? 1 : 2; \ + (unit_pos) += CHAR_UNITS_BY_HEAD_UTF16 (c); \ } \ else \ (unit_pos)++; \ @@ -131,7 +131,7 @@ static enum MTextFormat default_utf_32 = MTEXT_FORMAT_UTF_32LE; } while (0) -/** Decrement character position CHAR_POS and byte position UNIT_POS +/** Decrement character position CHAR_POS and unit position UNIT_POS so that they point to the previous character in M-text MT. No range check for CHAR_POS and UNIT_POS. */ @@ -151,7 +151,7 @@ static enum MTextFormat default_utf_32 = MTEXT_FORMAT_UTF_32LE; \ if ((mt)->format != default_utf_16) \ c = SWAP_16 (c); \ - (unit_pos) -= (c < 0xD800 || c >= 0xE000) ? 1 : 2; \ + (unit_pos) -= 2 - (c < 0xD800 || c >= 0xE000); \ } \ else \ (unit_pos)--; \ @@ -159,6 +159,9 @@ static enum MTextFormat default_utf_32 = MTEXT_FORMAT_UTF_32LE; } while (0) +/* Compoare sub-texts in MT1 (range FROM1 and TO1) and MT2 (range + FROM2 to TO2). */ + static int compare (MText *mt1, int from1, int to1, MText *mt2, int from2, int to2) { @@ -166,17 +169,24 @@ compare (MText *mt1, int from1, int to1, MText *mt2, int from2, int to2) && (mt1->format <= MTEXT_FORMAT_UTF_8)) { unsigned char *p1, *pend1, *p2, *pend2; + int unit_bytes = UNIT_BYTES (mt1->format); + int nbytes; + int result; - p1 = mt1->data + mtext__char_to_byte (mt1, from1); - pend1 = mt1->data + mtext__char_to_byte (mt1, to1); + p1 = mt1->data + mtext__char_to_byte (mt1, from1) * unit_bytes; + pend1 = mt1->data + mtext__char_to_byte (mt1, to1) * unit_bytes; - p2 = mt2->data + mtext__char_to_byte (mt2, from2); - pend2 = mt2->data + mtext__char_to_byte (mt2, to2); + p2 = mt2->data + mtext__char_to_byte (mt2, from2) * unit_bytes; + pend2 = mt2->data + mtext__char_to_byte (mt2, to2) * unit_bytes; - for (; p1 < pend1 && p2 < pend2; p1++, p2++) - if (*p1 != *p2) - return (*p1 > *p2 ? 1 : -1); - return (p2 == pend2 ? (p1 < pend1) : -1); + if (pend1 - p1 < pend2 - p2) + nbytes = pend1 - p1; + else + nbytes = pend2 - p2; + result = memcmp (p1, p2, nbytes); + if (result) + return result; + return ((pend1 - p1) - (pend2 - p2)); } for (; from1 < to1 && from2 < to2; from1++, from2++) { @@ -189,68 +199,173 @@ compare (MText *mt1, int from1, int to1, MText *mt2, int from2, int to2) return (from2 == to2 ? (from1 < to1) : -1); } -static MText * -copy (MText *mt1, int pos, MText *mt2, int from, int to) + +/* Return how many units are required in UTF-8 to represent characters + between FROM and TO of MT. */ + +static int +count_by_utf_8 (MText *mt, int from, int to) { - int pos_byte = POS_CHAR_TO_BYTE (mt1, pos); - int nbytes; - struct MTextPlist *plist; - unsigned char *p; + int n, c; - if (mt2->format <= MTEXT_FORMAT_UTF_8) + for (n = 0; from < to; from++) { - int from_byte = POS_CHAR_TO_BYTE (mt2, from); - - p = mt2->data + from_byte; - nbytes = POS_CHAR_TO_BYTE (mt2, to) - from_byte; + c = mtext_ref_char (mt, from); + n += CHAR_UNITS_UTF8 (c); } - else + return n; +} + + +/* Return how many units are required in UTF-16 to represent + characters between FROM and TO of MT. */ + +static int +count_by_utf_16 (MText *mt, int from, int to) +{ + int n, c; + + for (n = 0; from < to; from++) { - unsigned char *p1; - int pos1; + c = mtext_ref_char (mt, from); + n += CHAR_UNITS_UTF16 (c); + } + return n; +} - p = p1 = alloca (MAX_UNICODE_CHAR_BYTES * (to - from)); - for (pos1 = from; pos1 < to; pos1++) + +/* Insert text between FROM and TO of MT2 at POS of MT1. */ + +static MText * +insert (MText *mt1, int pos, MText *mt2, int from, int to) +{ + int pos_unit = POS_CHAR_TO_BYTE (mt1, pos); + int from_unit = POS_CHAR_TO_BYTE (mt2, from); + int new_units = POS_CHAR_TO_BYTE (mt2, to) - from_unit; + int unit_bytes; + + if (mt1->nchars == 0) + mt1->format = mt2->format; + else if (mt1->format != mt2->format) + { + /* Be sure to make mt1->format sufficient to contain all + characters in mt2. */ + if (mt1->format == MTEXT_FORMAT_UTF_8 + || mt1->format == default_utf_32 + || (mt1->format == default_utf_16 + && mt2->format <= MTEXT_FORMAT_UTF_16BE + && mt2->format != MTEXT_FORMAT_UTF_8)) + ; + else if (mt1->format == MTEXT_FORMAT_US_ASCII) { - int c = mtext_ref_char (mt2, pos1); - p1 += CHAR_STRING (c, p1); + if (mt2->format == MTEXT_FORMAT_UTF_8) + mt1->format = MTEXT_FORMAT_UTF_8; + else if (mt2->format == default_utf_16 + || mt2->format == default_utf_32) + mtext__adjust_format (mt1, mt2->format); + else + mtext__adjust_format (mt1, MTEXT_FORMAT_UTF_8); + } + else + { + mtext__adjust_format (mt1, MTEXT_FORMAT_UTF_8); + pos_unit = POS_CHAR_TO_BYTE (mt1, pos); } - nbytes = p1 - p; } - if (mt1->cache_char_pos > pos) + unit_bytes = UNIT_BYTES (mt1->format); + + if (mt1->format == mt2->format) { - mt1->cache_char_pos = pos; - mt1->cache_byte_pos = pos_byte; - } + int pos_byte = pos_unit * unit_bytes; + int total_bytes = (mt1->nbytes + new_units) * unit_bytes; + int new_bytes = new_units * unit_bytes; - if (pos_byte + nbytes >= mt1->allocated) + if (total_bytes + unit_bytes > mt1->allocated) + { + mt1->allocated = total_bytes + unit_bytes; + MTABLE_REALLOC (mt1->data, mt1->allocated, MERROR_MTEXT); + } + if (pos < mt1->nchars) + memmove (mt1->data + pos_byte + new_bytes, mt1->data + pos_byte, + (mt1->nbytes - pos_unit + 1) * unit_bytes); + memcpy (mt1->data + pos_byte, mt2->data + from_unit * unit_bytes, + new_bytes); + } + else if (mt1->format == MTEXT_FORMAT_UTF_8) { - mt1->allocated = pos_byte + nbytes + 1; - MTABLE_REALLOC (mt1->data, mt1->allocated, MERROR_MTEXT); + unsigned char *p; + int total_bytes, i, c; + + new_units = count_by_utf_8 (mt2, from, to); + total_bytes = mt1->nbytes + new_units; + + if (total_bytes + 1 > mt1->allocated) + { + mt1->allocated = total_bytes + 1; + MTABLE_REALLOC (mt1->data, mt1->allocated, MERROR_MTEXT); + } + p = mt1->data + pos_unit; + memmove (p + new_units, p, mt1->nbytes - pos_unit + 1); + for (i = from; i < to; i++) + { + c = mtext_ref_char (mt2, i); + p += CHAR_STRING_UTF8 (c, p); + } } - memcpy (mt1->data + pos_byte, p, nbytes); - mt1->nbytes = pos_byte + nbytes; - mt1->data[mt1->nbytes] = 0; + else if (mt1->format == default_utf_16) + { + unsigned short *p; + int total_bytes, i, c; - plist = mtext__copy_plist (mt2->plist, from, to, mt1, pos); - if (pos == 0) + new_units = count_by_utf_16 (mt2, from, to); + total_bytes = (mt1->nbytes + new_units) * USHORT_SIZE; + + if (total_bytes + USHORT_SIZE > mt1->allocated) + { + mt1->allocated = total_bytes + USHORT_SIZE; + MTABLE_REALLOC (mt1->data, mt1->allocated, MERROR_MTEXT); + } + p = (unsigned short *) mt1->data + pos_unit; + memmove (p + new_units, p, + (mt1->nbytes - pos_unit + 1) * USHORT_SIZE); + for (i = from; i < to; i++) + { + c = mtext_ref_char (mt2, i); + p += CHAR_STRING_UTF16 (c, p); + } + } + else /* default_utf_32 */ { - if (mt1->plist) - mtext__free_plist (mt1); - mt1->plist = plist; + unsigned int *p; + int total_bytes, i; + + new_units = to - from; + total_bytes = (mt1->nbytes + new_units) * UINT_SIZE; + + if (total_bytes + UINT_SIZE > mt1->allocated) + { + mt1->allocated = total_bytes + UINT_SIZE; + MTABLE_REALLOC (mt1->data, mt1->allocated, MERROR_MTEXT); + } + p = (unsigned *) mt1->data + pos_unit; + memmove (p + new_units, p, + (mt1->nbytes - pos_unit + 1) * UINT_SIZE); + for (i = from; i < to; i++) + *p++ = mtext_ref_char (mt2, i); } - else + + mtext__adjust_plist_for_insert + (mt1, pos, to - from, + mtext__copy_plist (mt2->plist, from, to, mt1, pos)); + mt1->nchars += to - from; + mt1->nbytes += new_units; + if (mt1->cache_char_pos > pos) { - if (pos < mt1->nchars) - mtext__adjust_plist_for_delete (mt1, pos, mt1->nchars - pos); - if (from < to) - mtext__adjust_plist_for_insert (mt1, pos, to - from, plist); + mt1->cache_char_pos += to - from; + mt1->cache_byte_pos += new_units; } - mt1->nchars = pos + (to - from); - if (mt1->nchars < mt1->nbytes) - mt1->format = MTEXT_FORMAT_UTF_8; return mt1; } @@ -331,34 +446,33 @@ count_utf_16_chars (void *data, int nitems, int swap) unsigned short *p = (unsigned short *) data; unsigned short *pend = p + nitems; int nchars = 0; + int prev_surrogate = 0; - while (p < pend) + for (; p < pend; p++) { - unsigned b; + int c = *p; - for (; p < pend; nchars++, p++) + if (swap) + c = SWAP_16 (c); + if (prev_surrogate) { - b = swap ? *p & 0xFF : *p >> 8; - - if (b >= 0xD8 && b < 0xE0) - { - if (b >= 0xDC) - return -1; - break; - } + if (c < 0xDC00 || c >= 0xE000) + return -1; + prev_surrogate = 0; + } + else + { + if (c < 0xD800) + ; + else if (c < 0xDC00) + prev_surrogate = 1; + else if (c < 0xE000) + return -1; + nchars++; } - if (p == pend) - break; - if (p + 1 == pend) - return -1; - p++; - b = swap ? *p & 0xFF : *p >> 8; - if (b < 0xDC || b >= 0xE0) - return -1; - nchars++; - p++; } - + if (prev_surrogate) + return -1; return nchars; } @@ -374,16 +488,12 @@ find_char_forward (MText *mt, int from, int to, int c) while (from < to && STRING_CHAR_ADVANCE_UTF8 (p) != c) from++; } - else if (mt->format <= MTEXT_FORMAT_UTF_16LE) + else if (mt->format <= MTEXT_FORMAT_UTF_16BE) { unsigned short *p = (unsigned short *) (mt->data) + from_byte; if (mt->format == default_utf_16) - { - unsigned short *p = (unsigned short *) (mt->data) + from_byte; - - while (from < to && STRING_CHAR_ADVANCE_UTF16 (p) != c) from++; - } + while (from < to && STRING_CHAR_ADVANCE_UTF16 (p) != c) from++; else if (c < 0x10000) { c = SWAP_16 (c); @@ -406,8 +516,10 @@ find_char_forward (MText *mt, int from, int to, int c) p += ((*p & 0xFF) < 0xD8 || (*p & 0xFF) >= 0xE0) ? 1 : 2; } } + else + from = to; } - else if (c < 0x110000) + else { unsigned *p = (unsigned *) (mt->data) + from_byte; unsigned c1 = c; @@ -468,8 +580,8 @@ find_char_backward (MText *mt, int from, int to, int c) int c1 = (c >> 10) + 0xD800; int c2 = (c & 0x3FF) + 0xDC00; - c1 = SWAP_32 (c1); - c2 = SWAP_32 (c2); + c1 = SWAP_16 (c1); + c2 = SWAP_16 (c2); while (from < to && (p[-1] != c2 || p[-2] != c1)) { to--; @@ -477,7 +589,7 @@ find_char_backward (MText *mt, int from, int to, int c) } } } - else if (c < 0x110000) + else { unsigned *p = (unsigned *) (mt->data) + to_byte; unsigned c1 = c; @@ -753,8 +865,7 @@ mtext__from_data (void *data, int nitems, enum MTextFormat format, int need_copy) { MText *mt; - int nchars = nitems; - int bytes = nitems; + int nchars, nbytes, unit_bytes; if (format == MTEXT_FORMAT_US_ASCII) { @@ -763,46 +874,41 @@ mtext__from_data (void *data, int nitems, enum MTextFormat format, while (p < pend) if (*p++ < 0) MERROR (MERROR_MTEXT, NULL); + nchars = nbytes = nitems; + unit_bytes = 1; } else if (format == MTEXT_FORMAT_UTF_8) { if ((nchars = count_utf_8_chars (data, nitems)) < 0) MERROR (MERROR_MTEXT, NULL); + nbytes = nitems; + unit_bytes = 1; } else if (format <= MTEXT_FORMAT_UTF_16BE) { if ((nchars = count_utf_16_chars (data, nitems, format != default_utf_16)) < 0) MERROR (MERROR_MTEXT, NULL); - bytes = sizeof (short) * nitems; + nbytes = USHORT_SIZE * nitems; + unit_bytes = USHORT_SIZE; } - else if (format <= MTEXT_FORMAT_UTF_32BE) + else /* MTEXT_FORMAT_UTF_32XX */ { - unsigned *p = (unsigned *) data, *pend = p + nitems; - int swap = format != default_utf_32; - - for (; p < pend; p++) - { - unsigned c = swap ? SWAP_32 (*p) : *p; - - if ((c >= 0xD800 && c < 0xE000) || (c >= 0x110000)) - MERROR (MERROR_MTEXT, NULL); - } - bytes = sizeof (unsigned) * nitems; + nchars = nitems; + nbytes = UINT_SIZE * nitems; + unit_bytes = UINT_SIZE; } - else - MERROR (MERROR_MTEXT, NULL); mt = mtext (); mt->format = format; - mt->allocated = need_copy ? bytes : -1; + mt->allocated = need_copy ? nbytes + unit_bytes : -1; mt->nchars = nchars; mt->nbytes = nitems; if (need_copy) { - mt->data = malloc (bytes + 1); - memcpy (mt->data, data, bytes); - mt->data[bytes] = 0; + MTABLE_MALLOC (mt->data, mt->allocated, MERROR_MTEXT); + memcpy (mt->data, data, nbytes); + mt->data[nbytes] = 0; } else mt->data = (unsigned char *) data; @@ -810,79 +916,81 @@ mtext__from_data (void *data, int nitems, enum MTextFormat format, } -/* Not yet implemented. */ - -int +void mtext__adjust_format (MText *mt, enum MTextFormat format) { - if (mt->format == format) - return 0; - if (mt->format == MTEXT_FORMAT_US_ASCII) - { - if (format == MTEXT_FORMAT_UTF_8) - mt->format = MTEXT_FORMAT_UTF_8; - MERROR (MERROR_MTEXT, -1); - } - else if (mt->format == MTEXT_FORMAT_UTF_8) - { - MERROR (MERROR_MTEXT, -1); - } - else if (mt->format <= MTEXT_FORMAT_UTF_16BE) - { - MERROR (MERROR_MTEXT, -1); - } - else - { - MERROR (MERROR_MTEXT, -1); - } - return 0; -} - - -int -mtext__replace (MText *mt, int from, int to, char *from_str, char *to_str) -{ - int from_byte = POS_CHAR_TO_BYTE (mt, from); - int to_byte = POS_CHAR_TO_BYTE (mt, to); - unsigned char *p = MTEXT_DATA (mt) + from_byte; - unsigned char *endp = MTEXT_DATA (mt) + to_byte; - int from_str_len = strlen (from_str); - int to_str_len = strlen (to_str); - int diff = to_str_len - from_str_len; - unsigned char saved_byte; - int pos, pos_byte; - - if (mtext_nchars (mt) == 0 - || from_str_len == 0) - return 0; - M_CHECK_READONLY (mt, -1); - M_CHECK_RANGE (mt, from, to, -1, 0); + int i, c; - saved_byte = *endp; - *endp = '\0'; - while ((p = (unsigned char *) strstr ((char *) p, from_str)) != NULL) - { - if (diff < 0) + if (mt->nchars > 0) + switch (format) + { + case MTEXT_FORMAT_US_ASCII: { - pos_byte = p - MTEXT_DATA (mt); - pos = POS_BYTE_TO_CHAR (mt, pos_byte); - mtext_del (mt, pos, pos - diff); + unsigned char *p = mt->data; + + for (i = 0; i < mt->nchars; i++) + *p++ = mtext_ref_char (mt, i); + mt->nbytes = mt->nchars; + mt->cache_byte_pos = mt->cache_char_pos; + break; } - else if (diff > 0) + + case MTEXT_FORMAT_UTF_8: { - pos_byte = p - MTEXT_DATA (mt); - pos = POS_BYTE_TO_CHAR (mt, pos_byte); - mtext_ins_char (mt, pos, ' ', diff); - /* The above may relocate mt->data. */ - endp += (MTEXT_DATA (mt) + pos_byte) - p; - p = MTEXT_DATA (mt) + pos_byte; + unsigned char *p0, *p1; + + i = count_by_utf_8 (mt, 0, mt->nchars) + 1; + MTABLE_MALLOC (p0, i, MERROR_MTEXT); + mt->allocated = i; + for (i = 0, p1 = p0; i < mt->nchars; i++) + { + c = mtext_ref_char (mt, i); + p1 += CHAR_STRING_UTF8 (c, p1); + } + *p1 = '\0'; + free (mt->data); + mt->data = p0; + mt->nbytes = p1 - p0; + mt->cache_char_pos = mt->cache_byte_pos = 0; + break; } - memmove (p, to_str, to_str_len); - p += to_str_len; - endp += diff; - } - *endp = saved_byte; - return 0; + + default: + if (format == default_utf_16) + { + unsigned short *p0, *p1; + + i = (count_by_utf_16 (mt, 0, mt->nchars) + 1) * USHORT_SIZE; + MTABLE_MALLOC (p0, i, MERROR_MTEXT); + mt->allocated = i; + for (i = 0, p1 = p0; i < mt->nchars; i++) + { + c = mtext_ref_char (mt, i); + p1 += CHAR_STRING_UTF16 (c, p1); + } + *p1 = 0; + free (mt->data); + mt->data = (unsigned char *) p0; + mt->nbytes = p1 - p0; + mt->cache_char_pos = mt->cache_byte_pos = 0; + break; + } + else + { + unsigned int *p; + + mt->allocated = (mt->nchars + 1) * UINT_SIZE; + MTABLE_MALLOC (p, mt->allocated, MERROR_MTEXT); + for (i = 0; i < mt->nchars; i++) + p[i] = mtext_ref_char (mt, i); + p[i] = 0; + free (mt->data); + mt->data = (unsigned char *) p; + mt->nbytes = mt->nchars; + mt->cache_byte_pos = mt->cache_char_pos; + } + } + mt->format = format; } @@ -1110,32 +1218,9 @@ mtext () MText * mtext_from_data (void *data, int nitems, enum MTextFormat format) { - if (nitems < 0) + if (nitems < 0 + || format < MTEXT_FORMAT_US_ASCII || format >= MTEXT_FORMAT_MAX) MERROR (MERROR_MTEXT, NULL); - if (nitems == 0) - { - if (format == MTEXT_FORMAT_US_ASCII - || format == MTEXT_FORMAT_UTF_8) - { - unsigned char *p = data; - - while (*p++) nitems++; - } - else if (format <= MTEXT_FORMAT_UTF_16BE) - { - unsigned short *p = data; - - while (*p++) nitems++; - } - else if (format <= MTEXT_FORMAT_UTF_32BE) - { - unsigned *p = data; - - while (*p++) nitems++; - } - else - MERROR (MERROR_MTEXT, NULL); - } return mtext__from_data (data, nitems, format, 0); } @@ -1192,33 +1277,28 @@ mtext_ref_char (MText *mt, int pos) { unsigned char *p = mt->data + POS_CHAR_TO_BYTE (mt, pos); - c = STRING_CHAR (p); + c = STRING_CHAR_UTF8 (p); } else if (mt->format <= MTEXT_FORMAT_UTF_16BE) { unsigned short *p = (unsigned short *) (mt->data) + POS_CHAR_TO_BYTE (mt, pos); + unsigned short p1[2]; - if (mt->format == default_utf_16) - c = STRING_CHAR_UTF16 (p); - else + if (mt->format != default_utf_16) { - c = (*p >> 8) | ((*p & 0xFF) << 8); - if (c >= 0xD800 && c < 0xE000) - { - int c1 = (p[1] >> 8) | ((p[1] & 0xFF) << 8); - c = ((c - 0xD800) << 10) + (c1 - 0xDC00) + 0x10000; - } + p1[0] = SWAP_16 (*p); + if (p1[0] >= 0xD800 || p1[0] < 0xDC00) + p1[1] = SWAP_16 (p[1]); + p = p1; } + c = STRING_CHAR_UTF16 (p); } else { - unsigned *p = (unsigned *) (mt->data) + POS_CHAR_TO_BYTE (mt, pos); - - if (mt->format == default_utf_32) - c = *p; - else - c = SWAP_32 (*p); + c = ((unsigned *) (mt->data))[pos]; + if (mt->format != default_utf_32) + c = SWAP_32 (c); } return c; } @@ -1255,45 +1335,77 @@ mtext_ref_char (MText *mt, int pos) int mtext_set_char (MText *mt, int pos, int c) { - int byte_pos; - int bytes_old, bytes_new; + int pos_unit; + int old_units, new_units; int delta; - unsigned char str[MAX_UTF8_CHAR_BYTES]; unsigned char *p; - int i; + int unit_bytes; M_CHECK_POS (mt, pos, -1); M_CHECK_READONLY (mt, -1); - byte_pos = POS_CHAR_TO_BYTE (mt, pos); - p = mt->data + byte_pos; - bytes_old = CHAR_BYTES_AT (p); - bytes_new = CHAR_STRING (c, str); - delta = bytes_new - bytes_old; + mtext__adjust_plist_for_change (mt, pos, pos + 1); + + if (mt->format <= MTEXT_FORMAT_UTF_8) + { + if (c >= 0x80) + mt->format = MTEXT_FORMAT_UTF_8; + } + else if (mt->format <= MTEXT_FORMAT_UTF_16BE) + { + if (c >= 0x110000) + mtext__adjust_format (mt, MTEXT_FORMAT_UTF_8); + else if (mt->format != default_utf_16) + mtext__adjust_format (mt, default_utf_16); + } + else if (mt->format != default_utf_32) + mtext__adjust_format (mt, default_utf_32); - /* mtext__adjust_plist_for_change (mt, pos, pos + 1);*/ + unit_bytes = UNIT_BYTES (mt->format); + pos_unit = POS_CHAR_TO_BYTE (mt, pos); + p = mt->data + pos_unit * unit_bytes; + old_units = CHAR_UNITS_AT (mt, p); + new_units = CHAR_UNITS (c, mt->format); + delta = new_units - old_units; if (delta) { - int byte_pos_old = byte_pos + bytes_old; - int byte_pos_new = byte_pos + bytes_new; - if (mt->cache_char_pos > pos) mt->cache_byte_pos += delta; - if ((mt->allocated - mt->nbytes) <= delta) + if ((mt->nbytes + delta + 1) * unit_bytes > mt->allocated) { - mt->allocated = mt->nbytes + delta + 1; + mt->allocated = (mt->nbytes + delta + 1) * unit_bytes; MTABLE_REALLOC (mt->data, mt->allocated, MERROR_MTEXT); } - memmove (mt->data + byte_pos_old, mt->data + byte_pos_new, - mt->nbytes - byte_pos_old); + memmove (mt->data + (pos_unit + new_units) * unit_bytes, + mt->data + (pos_unit + old_units) * unit_bytes, + (mt->nbytes - pos_unit - old_units + 1) * unit_bytes); mt->nbytes += delta; - mt->data[mt->nbytes] = 0; + mt->data[mt->nbytes * unit_bytes] = 0; + } + switch (mt->format) + { + case MTEXT_FORMAT_US_ASCII: + mt->data[pos_unit] = c; + break; + case MTEXT_FORMAT_UTF_8: + { + unsigned char *p = mt->data + pos_unit; + CHAR_STRING_UTF8 (c, p); + break; + } + default: + if (mt->format == default_utf_16) + { + unsigned short *p = (unsigned short *) mt->data + pos_unit; + + CHAR_STRING_UTF16 (c, p); + } + else + ((unsigned *) mt->data)[pos_unit] = c; } - for (i = 0; i < bytes_new; i++) - mt->data[byte_pos + i] = str[i]; return 0; } @@ -1326,28 +1438,63 @@ mtext_set_char (MText *mt, int pos, int c) MText * mtext_cat_char (MText *mt, int c) { - unsigned char buf[MAX_UTF8_CHAR_BYTES]; - int nbytes; - int total_bytes; + int nunits; + int unit_bytes = UNIT_BYTES (mt->format); M_CHECK_READONLY (mt, NULL); if (c < 0 || c > MCHAR_MAX) return NULL; - nbytes = CHAR_STRING (c, buf); + mtext__adjust_plist_for_insert (mt, mt->nchars, 1, NULL); - total_bytes = mt->nbytes + nbytes; + if (c >= 0x80 + && (mt->format == MTEXT_FORMAT_US_ASCII + || (c >= 0x10000 + && (mt->format == MTEXT_FORMAT_UTF_16LE + || mt->format == MTEXT_FORMAT_UTF_16BE)))) - mtext__adjust_plist_for_insert (mt, mt->nchars, 1, NULL); + { + mtext__adjust_format (mt, MTEXT_FORMAT_UTF_8); + unit_bytes = 1; + } + else if (mt->format >= MTEXT_FORMAT_UTF_32LE) + { + if (mt->format != default_utf_32) + mtext__adjust_format (mt, default_utf_32); + } + else if (mt->format >= MTEXT_FORMAT_UTF_16LE) + { + if (mt->format != default_utf_16) + mtext__adjust_format (mt, default_utf_16); + } - if (total_bytes >= mt->allocated) + nunits = CHAR_UNITS (c, mt->format); + if ((mt->nbytes + nunits + 1) * unit_bytes > mt->allocated) { - mt->allocated = total_bytes + 1; + mt->allocated = (mt->nbytes + nunits + 1) * unit_bytes; MTABLE_REALLOC (mt->data, mt->allocated, MERROR_MTEXT); } - memcpy (mt->data + mt->nbytes, buf, nbytes); - mt->nbytes = total_bytes; + + if (mt->format <= MTEXT_FORMAT_UTF_8) + { + unsigned char *p = mt->data + mt->nbytes; + p += CHAR_STRING_UTF8 (c, p); + *p = 0; + } + else if (mt->format == default_utf_16) + { + unsigned short *p = (unsigned short *) mt->data + mt->nbytes; + p += CHAR_STRING_UTF16 (c, p); + *p = 0; + } + else + { + unsigned *p = (unsigned *) mt->data + mt->nbytes; + *p++ = c; + *p = 0; + } + mt->nchars++; - mt->data[total_bytes] = 0; + mt->nbytes += nunits; return mt; } @@ -1380,7 +1527,16 @@ mtext_cat_char (MText *mt, int c) MText * mtext_dup (MText *mt) { - return copy (mtext (), 0, mt, 0, mt->nchars); + MText *new = mtext (); + int unit_bytes = UNIT_BYTES (mt->format); + + *new = *mt; + new->allocated = (mt->nbytes + 1) * unit_bytes; + MTABLE_MALLOC (new->data, new->allocated, MERROR_MTEXT); + memcpy (new->data, mt->data, new->allocated); + if (mt->plist) + new->plist = mtext__copy_plist (mt->plist, 0, mt->nchars, new, 0); + return new; } /*=*/ @@ -1416,7 +1572,7 @@ mtext_cat (MText *mt1, MText *mt2) { M_CHECK_READONLY (mt1, NULL); - return copy (mt1, mt1->nchars, mt2, 0, mt2->nchars); + return insert (mt1, mt1->nchars, mt2, 0, mt2->nchars); } @@ -1465,7 +1621,7 @@ mtext_ncat (MText *mt1, MText *mt2, int n) M_CHECK_READONLY (mt1, NULL); if (n < 0) MERROR (MERROR_RANGE, NULL); - return copy (mt1, mt1->nchars, mt2, 0, mt2->nchars < n ? mt2->nchars : n); + return insert (mt1, mt1->nchars, mt2, 0, mt2->nchars < n ? mt2->nchars : n); } @@ -1502,7 +1658,8 @@ MText * mtext_cpy (MText *mt1, MText *mt2) { M_CHECK_READONLY (mt1, NULL); - return copy (mt1, 0, mt2, 0, mt2->nchars); + mtext_del (mt1, 0, mt1->nchars); + return insert (mt1, 0, mt2, 0, mt2->nchars); } /*=*/ @@ -1550,7 +1707,8 @@ mtext_ncpy (MText *mt1, MText *mt2, int n) M_CHECK_READONLY (mt1, NULL); if (n < 0) MERROR (MERROR_RANGE, NULL); - return (copy (mt1, 0, mt2, 0, mt2->nchars < n ? mt2->nchars : n)); + mtext_del (mt1, 0, mt1->nchars); + return insert (mt1, 0, mt2, 0, mt2->nchars < n ? mt2->nchars : n); } /*=*/ @@ -1592,10 +1750,12 @@ mtext_ncpy (MText *mt1, MText *mt2, int n) MText * mtext_duplicate (MText *mt, int from, int to) { - MText *new = mtext (); + MText *new; M_CHECK_RANGE (mt, from, to, NULL, new); - return copy (new, 0, mt, from, to); + new = mtext (); + new->format = mt->format; + return insert (new, 0, mt, from, to); } /*=*/ @@ -1641,8 +1801,9 @@ mtext_copy (MText *mt1, int pos, MText *mt2, int from, int to) { M_CHECK_POS_X (mt1, pos, NULL); M_CHECK_READONLY (mt1, NULL); - M_CHECK_RANGE (mt2, from, to, NULL, mt1); - return copy (mt1, pos, mt2, from, to); + M_CHECK_RANGE_X (mt2, from, to, NULL); + mtext_del (mt1, pos, mt1->nchars); + return insert (mt1, pos, mt2, from, to); } /*=*/ @@ -1683,6 +1844,7 @@ int mtext_del (MText *mt, int from, int to) { int from_byte, to_byte; + int unit_bytes = UNIT_BYTES (mt->format); M_CHECK_READONLY (mt, -1); M_CHECK_RANGE (mt, from, to, -1, 0); @@ -1702,7 +1864,9 @@ mtext_del (MText *mt, int from, int to) } mtext__adjust_plist_for_delete (mt, from, to - from); - memmove (mt->data + from_byte, mt->data + to_byte, mt->nbytes - to_byte + 1); + memmove (mt->data + from_byte * unit_bytes, + mt->data + to_byte * unit_bytes, + (mt->nbytes - to_byte + 1) * unit_bytes); mt->nchars -= (to - from); mt->nbytes -= (to_byte - from_byte); mt->cache_char_pos = from; @@ -1748,35 +1912,12 @@ mtext_del (MText *mt, int from, int to) int mtext_ins (MText *mt1, int pos, MText *mt2) { - int byte_pos; - int total_bytes; - M_CHECK_READONLY (mt1, -1); M_CHECK_POS_X (mt1, pos, -1); if (mt2->nchars == 0) return 0; - mtext__adjust_plist_for_insert - (mt1, pos, mt2->nchars, - mtext__copy_plist (mt2->plist, 0, mt2->nchars, mt1, pos)); - - total_bytes = mt1->nbytes + mt2->nbytes; - if (total_bytes >= mt1->allocated) - { - mt1->allocated = total_bytes + 1; - MTABLE_REALLOC (mt1->data, mt1->allocated, MERROR_MTEXT); - } - byte_pos = POS_CHAR_TO_BYTE (mt1, pos); - if (mt1->cache_char_pos > pos) - { - mt1->cache_char_pos += mt2->nchars; - mt1->cache_byte_pos += mt2->nbytes; - } - memmove (mt1->data + byte_pos + mt2->nbytes, mt1->data + byte_pos, - mt1->nbytes - byte_pos + 1); - memcpy (mt1->data + byte_pos, mt2->data, mt2->nbytes); - mt1->nbytes += mt2->nbytes; - mt1->nchars += mt2->nchars; + insert (mt1, pos, mt2, 0, mt2->nchars); return 0; } @@ -1815,9 +1956,9 @@ mtext_ins (MText *mt1, int pos, MText *mt2) int mtext_ins_char (MText *mt, int pos, int c, int n) { - int byte_pos; - int nbytes, total_bytes; - unsigned char *buf; + int nunits; + int unit_bytes = UNIT_BYTES (mt->format); + int pos_unit; int i; M_CHECK_READONLY (mt, -1); @@ -1827,26 +1968,64 @@ mtext_ins_char (MText *mt, int pos, int c, int n) if (n <= 0) return 0; mtext__adjust_plist_for_insert (mt, pos, n, NULL); - buf = alloca (MAX_UTF8_CHAR_BYTES * n); - for (i = 0, nbytes = 0; i < n; i++) - nbytes += CHAR_STRING (c, buf + nbytes); - total_bytes = mt->nbytes + nbytes; - if (total_bytes >= mt->allocated) + + if (c >= 0x80 + && (mt->format == MTEXT_FORMAT_US_ASCII + || (c >= 0x10000 && (mt->format == MTEXT_FORMAT_UTF_16LE + || mt->format == MTEXT_FORMAT_UTF_16BE)))) + { + mtext__adjust_format (mt, MTEXT_FORMAT_UTF_8); + unit_bytes = 1; + } + else if (mt->format >= MTEXT_FORMAT_UTF_32LE) + { + if (mt->format != default_utf_32) + mtext__adjust_format (mt, default_utf_32); + } + else if (mt->format >= MTEXT_FORMAT_UTF_16LE) + { + if (mt->format != default_utf_16) + mtext__adjust_format (mt, default_utf_16); + } + + nunits = CHAR_UNITS (c, mt->format); + if ((mt->nbytes + nunits * n + 1) * unit_bytes > mt->allocated) { - mt->allocated = total_bytes + 1; + mt->allocated = (mt->nbytes + nunits * n + 1) * unit_bytes; MTABLE_REALLOC (mt->data, mt->allocated, MERROR_MTEXT); } - byte_pos = POS_CHAR_TO_BYTE (mt, pos); + pos_unit = POS_CHAR_TO_BYTE (mt, pos); if (mt->cache_char_pos > pos) { - mt->cache_char_pos++; - mt->cache_byte_pos += nbytes; + mt->cache_char_pos += n; + mt->cache_byte_pos += nunits + n; + } + memmove (mt->data + (pos_unit + nunits * n) * unit_bytes, + mt->data + pos_unit * unit_bytes, + (mt->nbytes - pos_unit + 1) * unit_bytes); + if (mt->format <= MTEXT_FORMAT_UTF_8) + { + unsigned char *p = mt->data + pos_unit; + + for (i = 0; i < n; i++) + p += CHAR_STRING_UTF8 (c, p); + } + else if (mt->format == default_utf_16) + { + unsigned short *p = (unsigned short *) mt->data + pos_unit; + + for (i = 0; i < n; i++) + p += CHAR_STRING_UTF16 (c, p); + } + else + { + unsigned *p = (unsigned *) mt->data + pos_unit; + + for (i = 0; i < n; i++) + *p++ = c; } - memmove (mt->data + byte_pos + nbytes, mt->data + byte_pos, - mt->nbytes - byte_pos + 1); - memcpy (mt->data + byte_pos, buf, nbytes); - mt->nbytes += nbytes; mt->nchars += n; + mt->nbytes += nunits * n; return 0; } @@ -2255,7 +2434,7 @@ mtext_tok (MText *mt, MText *delim, int *pos) return NULL; *pos = pos2 + span (mt, delim, pos2, Mt); - return (copy (mtext (), 0, mt, pos2, *pos)); + return (insert (mtext (), 0, mt, pos2, *pos)); } /*=*/ @@ -2297,9 +2476,7 @@ mtext_text (MText *mt1, int pos, MText *mt2) int use_memcmp = (mt1->format == mt2->format || (mt1->format < MTEXT_FORMAT_UTF_8 && mt2->format == MTEXT_FORMAT_UTF_8)); - int unit_bytes = (mt1->format <= MTEXT_FORMAT_UTF_8 ? 1 - : mt1->format <= MTEXT_FORMAT_UTF_16BE ? 2 - : 4); + int unit_bytes = UNIT_BYTES (mt1->format); if (nbytes2 > pos_byte + nbytes1) return -1;