From: ntakahas Date: Wed, 4 Jul 2007 05:16:55 +0000 (+0000) Subject: Rewritten for Unicode 5.0 with new algorithm. X-Git-Tag: REL-1-4-0~16 X-Git-Url: http://git.chise.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d2694d73d256f93cb098c523b95afbd8dd0f2b35;p=m17n%2Fm17n-db.git Rewritten for Unicode 5.0 with new algorithm. --- diff --git a/FLT/BENG-OTF.flt b/FLT/BENG-OTF.flt index fe76ba1..c587cde 100644 --- a/FLT/BENG-OTF.flt +++ b/FLT/BENG-OTF.flt @@ -23,463 +23,194 @@ ;;;
  • BENG-OTF.flt ;;; -;;; For Bengali OpenType fonts to draw the Bengali script. Tested with -;;; MuktiNarrow.ttf -;;; and -;;; LikhanNormal.otf +;;; For Bengali OpenType fonts to draw the Bengali script. (font layouter beng-otf nil (font (nil nil unicode-bmp :otf=beng=rphf))) (category - ;; C: consonant (excluding B, Y and R) - ;; B: consonant BA (below) - ;; Y: consonant YA (post) - ;; R: consonant RA (reph, below) + ;; X: generic + ;; V: independent vowel + ;; C: consonant (except for R, B and Y) + ;; R: consonant RA + ;; B: consonant BA + ;; Y: consonant YA + ;; T: KHANDA TA ;; n: NUKTA ;; H: HALANT - ;; m: MATRA (pre) - ;; b: MATRA (below) - ;; p: MATRA (post) - ;; t: MATRA (two-part) + ;; m: vowel sign (pre) + ;; b: vowel sign (below) + ;; p: vowel sign (post) ;; A: vowel modifier (above) ;; a: vowel modifier (post) - ;; V: independent vowel + ;; Z: internal use ;; N: ZWNJ (ZERO WIDTH NON-JOINER) ;; J: ZWJ (ZERO WIDTH JOINER) - ;; E: ELSE - ;; - (0x200C ?N) ; ZWNJ - (0x200D ?J) ; ZWJ - (0x0964 0x0965 ?E) ; DANDA, DOUBLE DANDA - (0x0980 0x09FF ?E) ; ELSE - (0x0981 ?A) ; SIGN CANDRABINDU (above) - (0x0982 0x0983 ?a) ; SIGN ANUSWAR, VISARGA (post) - (0x0985 0x098C ?V) ; LETTER A .. VOCALIC L - (0x098F 0x0990 ?V) ; LETTER E .. AI - (0x0993 0x0994 ?V) ; LETTER O .. AU + (0x0980 0x09FF ?X) ; generic + (0x0980 ?Z) ; internal use + (0x0981 ?A) ; SIGN CANDRABINDU + (0x0982 0x0983 ?a) ; SIGN ANUSVARA .. VISARGA + (0x0985 0x0994 ?V) ; LETTER A .. AU (0x0995 0x09B9 ?C) ; LETTER KA .. 
HA (0x09AC ?B) ; LETTER BA (0x09AF ?Y) ; LETTER YA (0x09B0 ?R) ; LETTER RA (0x09BC ?n) ; SIGN NUKTA - (0x09BE ?p) ; VOWEL SIGN AA (post) - (0x09BF ?m) ; VOWEL SIGN I (pre) - (0x09C0 ?p) ; VOWEL SIGN II (post) - (0x09C1 0x09C4 ?b) ; VOWEL SIGN U, UU, R, RR (below) - (0x09C7 0x09C8 ?m) ; VOWEL SIGN E, AI (pre) - (0x09CB 0x09CC ?t) ; VOWEL SIGN O, AU (two-part) - (0x09CD ?H) ; SIGN VIRAMA (HASANT) - (0x09CE ?C) ; LETTER KHANDA TA + (0x09BE ?p) ; VOWEL SIGN AA + (0x09BF ?m) ; VOWEL SIGN I + (0x09C0 ?p) ; VOWEL SIGN II + (0x09C1 0x09C4 ?b) ; VOWEL SIGN U .. RR + (0x09C7 0x09C8 ?m) ; VOWEL SIGN E .. AI + (0x09CD ?H) ; SIGN VIRAMA + (0x09CE ?T) ; LETTER KHANDA TA (0x09D7 ?p) ; AU LENGTH MARK - (0x09DC 0x09DD ?C) ; LETTER RRA, RHA - (0x09DF ?C) ; LETTER YYA + (0x09DC 0x09DF ?C) ; LETTER RRA .. YYA (0x09E0 0x09E1 ?V) ; LETTER VOCALIC RR, LL - (0x09E2 0x09E3 ?b) ; VOWEL SIGN L .. LL (below) - (0x09F0 0x09F1 ?C) ; LETTER RR', RR'' (assamese) - (0x09FE ?x) ; mark #1 (internal use) - (0x09FF ?y) ; mark #2 (internal use) + (0x09E2 0x09E3 ?b) ; VOWEL SIGN L .. LL + (0x09F0 0x09F1 ?C) ; LETTER RR WITH MIDDLE/LOWER DIAGONAL + (0x0964 0x0965 ?X) ; DANDA .. DOUBLE DANDA + (0x200C ?N) ; ZWNJ + (0x200D ?J) ; ZWJ ) -;; Step 1 : Syllable identification. Recognised syllables are quoted -;; by the pseudo character, which is generated by the command "|" and -;; has the category " " (space). +;; Change consonant (generator (0 (cond - ;; Special case. The sequence "C1 H N C2 m" is reordered as - ;; "C1 H m C2", not "m C1 H C2". Besides, "C1 H" is drawn in the - ;; halant-form. - ("([CRBY]n?H)N" - < | - (1 otf:beng=hln) - | >) - - ;; Case A-C are for those syllables that end with an explicit vowel - ;; mark and/or a vowel modifier. They are divided into three cases - ;; for the readability of regular expression. The leading - ;; consonant-Hasant repetition is analysed for reordering in the - ;; next step. Two-part vowel, if any, is split for - ;; canonicalisation. 
- - ;; Case A : A syllable ending with a vowel modifier. - ;;1 23 4 5 6 7 - ("(RH)?(([CRBY]n?HJ?)*([CRBY]n?))([mbp]*)(t)?([Aa])" - < | - (1 = =) - (2 set-marks) - (5 = *) - (6 split) - (7 =) - | >) - - ;; Case B : A syllable ending with a two-part vowel. - ;;1 23 4 5 - ("(RH)?(([CRBY]n?HJ?)*([CRBY]n?))(t)" - < | - (1 = =) - (2 set-marks) - (5 split) - | >) - - ;; Case C : A syllable ending with other vowel. Note that a - ;; two-part vowel may be expressed with two vowel marks for - ;; backward compatibility. - ;;1 23 4 5 - ("(RH)?(([CRBY]n?HJ?)*([CRBY]n?))([mbp]+)" - < | - (1 = =) - (2 set-marks) - (5 = *) - | >) - - ;; Case D : Ya-phalaa. Reorder H and Y for the next step. - ;; The web page "Unicode FAQ for Indic Scripts and Languages" - ;; says "it should be - ;; permissible for the Ya-phalla to be consistently formed by "ZWNJ - ;; + VIRAMA + YA". - ("([CRBY]n?N)(H)(Y)" - < | - (1 = *) - (3 =) - (2 =) - | >) - - ;; Case E : No explicit vowel nor modifier. If the syllable ends - ;; with a consonant, analyse it for reordering in the next step. - ;; Otherwise, just identify the syllable without changing anything. - ;;1 23 4 - ("(RH)?(([CRBY]n?HJ?)*[CRBY]n?)(HN|HJ|H)?" - < | - (1 = =) - (2 set-marks) - (4 = *) - | >) + ;; Decompose two-part vowel signs. + ((0x09CB) + 0x09C7 0x09BE) + ((0x09CC) + 0x09C7 0x09D7) + ;; TA + HALANT + ZWJ -> KHANDA-TA + ((0x09A4 0x09CD 0x200D) + 0x09CE) + ;; consonant + NUKTA + ((0x09A1 0x09BC) + 0x09DC) + ((0x09A2 0x09BC) + 0x09DD) + ((0x09AF 0x09BC) + 0x09DF) + ("." =)) + *)) - ;; Case F : Syllables that begin with an independent vowel. An - ;; optional HYp sequence appears when this syllable represents the - ;; sound "a" in English "bat" (see the FAQ above). If it appears, - ;; we reorder the H and Y for the next step. - ("(V)(HYp)?([aA])?" - < | (1 =) (2 ("HY(p)" 0x09AF 0x09CD (1 =))) (3 =) | >) +;; Syllable identification and reordering. 
+(generator + (0 + (cond + ;; Khanda-Ta + ("(RH)?(T)" + < | (2 =) (1 otf:beng=rphf+) | >) + + ;; Standalone Ya-phalaa + ("JHY" + < | post | >) + + ;; A syllable with a pre-base vowel sign. + ;;1 2 3 4 5 6 7 8 9 + ("(RH)?([CRBY]n?(HCn?)*(H[RB])?(J?HY)?)(m)(p)?(A)?(a)?" + < | (6 =) (2 pre-below) (1 otf:beng=rphf+) (8 =) (2 post) (7 =) (9 =) | >) + + ;; A syllable with a non-pre-base vowel sign. + ;;1 2 3 4 5 6 78 9 10 11 + ("(RH)?([CRBY]n?(HCn?)*(H[RB])?(J?HY)?)(N)?J?((b)|(p))(A)?(a)?" + < | (6 =) (2 pre-below) (8 =) (1 otf:beng=rphf+) (10 =) + (2 post) (9 =) (11 =) | >) + + ;; A syllable with a vowel modifier and no vowel signs. + ;;1 2 3 4 5 67 8 + ("(RH)?([CRBY]n?(HCn?)*(H[RB])?(J?HY)?)((A)|(a))" + < | (2 pre-below) (1 otf:beng=rphf+) (7 =) (2 post) (8 =) | >) + + ;; A syllable ending with a halant. + ;;1 2 3 4 5 6 + ("(RH)?([CRBY]n?(HCn?)*(H[RB])?(J?HY)?)(H)?N?" + < | (2 pre-below) (6 =) (1 otf:beng=rphf+) (2 post) | >) + + ;; A syllable starting with an independent vowel. + ;;1 2 3 + ("(V)(J?HY)?(A?a?)" + < | (1 =) 0x0980 (2 post) (3 = *) | >) ("." =)) *) - ;; Set mark #1 (x) at the position where below consonants begin, and - ;; mark #2 (y) at the position to which below and above signs will be - ;; moved. - (set-marks + ;; Move a halant after the base consonant to the end. + ;; Put a special mark after the final below-base consonant. + ;; Remove post-base parts. + (pre-below (cond - ;; Ending with Y. - ;;1 2 3 45 6 - ("([CRBY]n?(HJ?Cn?)*)(H)(([RB]H)*)(Y)" - (1 = *) ; prebase & base - 0x09FE ; mark #1 - (4 = *) ; below consonants - 0x09FF ; mark #2 - (6 =) ; YA - (3 =)) ; moved HASANT - ;; Ending with R or B. 
- ;;1 2 3 45 - ("([CRBY]n?(HJ?Cn?)*)(H)(([RB]H)*[RB])" - (1 = *) ; prebase & base - 0x09FE ; mark #1 - (4 = *) ; below consonants - (3 =) ; moved HASANT - 0x09FF) ; mark #2 + ("(.+)(H)([RB])(J?HY)?$" + (1 = *) (3 =) (2 =) 0x0980) + ("([^J]+)J?HY$" + (1 = *) 0x0980) (".+" - = * - 0x09FE ; mark #1 - 0x09FF))) ; mark #2 + = * 0x0980))) - ;; Split two-part dependent vowel signs for canonicalisation. - (split + ;; Extract post-base parts and add a halant at the end. + ;; Produce nothing if there are no post-base parts. + (post (cond - ((0x09CB) 0x09C7 0x09BE) - ((0x09CC) 0x09C7 0x09D7))) - ) + (".*(H)(Y)$" + (2 =) (1 =)) + (".+" + )))) -;; Step 2 : Move Reph and Matra if necessary. From now on, we care -;; only for those syllables that have been identified in Step 1. +;; Apply 'nukt' and 'akhn'. (generator (0 (cond - ;; Special case: a single consonant and a Halant. - (" (.)xy(HJ?) " - | - 0x09FE - (1 =) - (2 = *) - 0x09FE - |) - - ;; This is the most generic pattern. It follows Cases A, B, C and - ;; E in Step 1. Now Mark #1 is used to indicate the critical part - ;; that requires pre-base substitution in the following steps. - - ;; 1 2 3 4 5 6 7 8 9 10 - (" (RH)?([^ xy]+)x([^ y]*)y(YH)?(m)?(b)?(p)?(A)?(a)?(HJ|H)? " - | - (5 =) ; [Mpre] - ;; Actually, the nukt feature is not necessary for Bengali because - ;; all the necessary Nukta forms are precomposed in the Unicode - ;; standard. Even if a Nukta consonant is given in the form of - ;; the combination of the base consonant and a Nukta sign, we can - ;; safely perform the composition here because it does not affect - ;; surrounding letters in the syllable. The Akhand ligature - ;; operation is also applied here, before applying the half form - ;; operation because the Mukti font generates Akhand ligatures - ;; directly from the "C H C" sequence, not via the half form. 
- 0x09FE ; begin Cpre & Cbase - (2 otf:beng=nukt,akhn) ; {Cpre + H} + Cbase - 0x09FE ; end Cpre & Cbase - (3 otf:beng=blwf) ; {Cbelow + H} - (6 =) ; [Mbelow] - (1 otf:beng=rphf) ; [Reph] - (8 =) ; [VMabove] - (4 otf:beng=pstf) ; [Cpost + H] - (7 =) ; [Mpost] - (9 =) ; [VMpost] - (10 = *) ; optional HASANT - |) - - ;; Syllables that begin with an independent vowel (following up - ;; Step 1, Case F). If a YH sequence exist, it is changed to the - ;; post-base form. Syllables of this type do not require further - ;; modification. - (" (V)(YH)(.*) " - | - (1 =) - (2 otf:beng=pstf) - (3 = *) - |) - - ;; Ya-phalaa (following up Step 1, Case D). Remove N and change YH - ;; to the post base form. Syllables of this type do not require - ;; further modification. - (" ([CBRY]n?)N(YH) " - | - (1 =) - (2 otf:beng=pstf) - |) - + (" ([^Z]+)(Z[^ ]*) " + | (1 otf:beng=nukt,akhn+) (2 = *) |) ("." =)) *)) -;; Step 3 : Now only those syllables that contain the pseudo character -;; x require pre-base substition. This is the most complicated part -;; in this FLT. - -;; If the sequence "C1 H C2" makes ligature L12, L12 replaces the -;; original sequence. - -;; To test the availability of such a ligature, we try to generate it -;; using the pre-base substitute feature, then see whether succeeded -;; or not. In the case of failure, the pre-base feature does not -;; change the original sequence. - -;; To create a ligature, the "C1 H" part must be first converted into -;; the half form of C1. Creating the half form of a consonant always -;; succeeds. - -;; ligature(half(C1,H),C2) -;; ==> ligature(C1half,C2) -;; ==> L12 ; success -;; C1half C2 ; fail - -;; If the ligature is not available, the "C1 H" part must be converted -;; into the _Halant_ (not half) form of C1. However, there is no way -;; to reconvert C1half into C1halant nor to revert back to "C1 H". -;; Thus we duplicate the critical part in two different forms so that -;; we can select the appropriate one in the next step. 
The pseudo -;; character x is used to indicate the boundaries. - -;; ... C1 H C2 ... ==> ... x C1halant C2 x L12 x ... - -;; If the length of the L12 part is one, ligature generation was -;; successful. In this case we wipe out the duplicated C1halant and -;; C2. Otherwise we remove L12. - -;; In very few cases (I found only one in the Mukti font), the "C1 H" -;; part need to be converted into C1halant (instead of C1half) to make -;; a ligature with C2. So when we try to generate a ligature form, we -;; apply the GSUB features "half", "haln" and "pres" in this order. - -(category - ;; C: consonant (excluding B, Y and R) - ;; H: HALANT - ;; N: ZWNJ (ZERO WIDTH NON-JOINER) - ;; J: ZWJ (ZERO WIDTH JOINER) - ;; E: ELSE - ;; - (0x200C ?N) ; ZWNJ - (0x200D ?J) ; ZWJ - (0x0964 0x0965 ?E) ; DANDA, DOUBLE DANDA - (0x0980 0x09FF ?E) ; ELSE - (0x09CD ?H) ; SIGN VIRAMA (HASANT) - (0x0995 ?K) ; LETTER KA - (0x09B7 ?S) ; LETTER SSA - (0x09A3 ?M) ; LETTER NNA - (0x09AE ?M) ; LETTER MA - (0x09FE ?x) ; mark #1 (internal use) - ) - +;; Apply 'blwf' and 'pstf' to the concerning parts. (generator (0 (cond - - ;; One pre-base and base. - ;; 1 23 4 5 6 - (" ([^x ]*)x((.H)([^J]))(H)?x([^ ]*) " - | - (1 = *) - 0x09FE ; x - (3 otf:beng=haln) ; C1halant - (4 =) ; C2 - 0x09FE ; x - (2 otf:beng=half,haln,pres) ; ligature result - 0x09FE ; x - (5 =) - (6 = *) - |) - - ;; One pre-base with ZWJ. According to the Unicode FAQ, the half - ;; form is forced in this case. So we fake as if ligature - ;; generation was failed. - (" ([^x ]*)x(.H)J(.)?x([^ ]*) " - | - (1 = *) - 0x09FE ; x - (2 otf:beng=half) ; C1half - (3 =) ; C2 - 0x09FE ; x - 0x09FD ; pseudo result - 0x09FD ; pseudo result - 0x09FE ; x - (4 = *) - |) - - ;; One pre-base possibly with ZWNJ. Similar to above. - (" ([^x ]*)x(.H)N?(.)?x([^ ]*) " - | - (1 = *) - 0x09FE ; x - (2 otf:beng=haln) ; C1halant - (3 =) ; C2 - 0x09FE ; x - 0x09FD ; pseudo result - 0x09FD ; pseudo result - 0x09FE ; x - (4 = *) - |) - - ;; Standalone base. 
There is nothing more to do. - (" ([^x ]*)x(.)x([^ ]*) " - | - (1 = *) - (2 =) - (3 = *) - |) - - ;; KA-SSA-NNA and KA-SSA-MA are the only pre-base ligatures that - ;; consist of three consonants. - ;; 1 23 4 5 6 7 - (" ([^x ]*)x((KH)(SH)(M))(H)?x([^ ]*) " - | - (1 = *) - 0x09FE ; x - (3 otf:beng=haln) ; KAhalant - (4 otf:beng=haln) ; SSAhalant - (5 =) ; NNA or MA - 0x09FE ; x - (2 otf:beng=half,haln,pres) ; ligature result - 0x09FE ; x - (6 =) - (7 = *) - |) - - ;; Two or more pre-bases plus base. Give up. Convert all - ;; pre-bases into halant form. - ;; 1 23 4 5 - (" ([^x ]*)x(([^x]H[JN]?)+)([^x])?x([^ ]*) " - | - (1 = *) - 0x09FE ; x - (2 force-haln) ; halant forms - (4 =) ; full form - 0x09FE ; x - 0x09FD ; pseudo result - 0x09FD ; pseudo result - 0x09FE ; x - (5 = *) - |) - + (" (N?m?.)([^Z]*)(Z)([^ ]*) " + | (1 = *) (2 otf:beng=blwf+) (3 =) (4 otf:beng=pstf+) |) + (" (YH) " + | (1 otf:beng=pstf+) |) ("." =)) - *) + *)) - ;; This is to remove ZWNJ and ZWJ. The half-form-force-effect of ZWJ - ;; is ignored. Sorry. - (force-haln +;; Get pre-base and below-base conjuncts. +(generator + (0 (cond - ("([^JN]*)[JN](.*)" - (1 otf:beng=haln) - (2 force-haln)) - (".+" - otf:beng=haln))) - ) + (" (N?m?)([^Z]+)(Z)([^ ]*) " + | (1 = *) (2 otf:beng=half,vatu,pres,blws+) (3 =) (4 = *) |) + ("." =)) + *)) -;; Step 4 : Select the appropriate representation. Only those -;; syllables that contain the virtual character x require -;; modification. +;; When the number of glyphs between a pre-base vowel sign and the +;; post-below mark is more than one, move the pre-base vowel sign +;; before the final glyph. (generator (0 (cond - ;; Only one glyph in the ligature section (between the second and - ;; the third x). It means a ligature was successfully generated. - ;; C1halant and C2 (between the first and second x) are removed. - (" ([^x ]*)x[^x]+x(.)x([^ ]*) " - | - (1 = *) - (2 =) - (3 = *) - |) - - ;; Otherwise halant and base forms are used. The failed ligature - ;; is removed. 
- (" ([^x ]*)x([^x]+)x[^x]+x([^ ]*) " - | - (1 = *) - (2 = *) - (3 = *) - |) - - ;; No need to care the other cases. + (" (N)?(m)([^Z]+)([^Z])Z([^ ]*) " + | (1 =) (3 = *) (2 =) (4 =) (5 = *) |) + (" ([^Z]+)Z([^ ]*) " + | (1 = *) (2 = *) |) ("." =)) *)) -;; Step 5 : Select appropriate glyph variants for fine adjustments. -;; Now the syllable boundary marks are removed so that the final step -;; can find word boundaries. +;; Get matra conjuncts. +;; Do not apply 'blws' to syllables that begin with ZWNJ. (generator (0 (cond + (" N([^ ]+) " + (1 otf:beng=init,pres,abvs,psts,haln)) (" ([^ ]+) " - (1 otf:beng=blws,abvs,psts,vatu)) + (1 otf:beng=init,pres,abvs,blws,psts,haln)) ("." - [ otf:beng=+ ] )) - *) - ) - -;; Step 6 : Word initial substitute. As the syllable boundaries have -;; been eliminated in the previous step, this rule is applied to a run -;; of Bengali glyphs, i.e. word by word. We finally apply the init -;; feature to the word initial gylphs to get the final result. -(generator - (0 - ("(.)(.*)" - (1 otf:beng=init) - (2 = *)))) + [ otf:beng=+ ])) + *)) ;; Local Variables: ;; mode: emacs-lisp