From: ntakahas <ntakahas>
Date: Thu, 28 Jun 2007 05:35:08 +0000 (+0000)
Subject: Rewritten with new algorithm for Unicode 5.0.
X-Git-Tag: REL-1-4-0~20
X-Git-Url: http://git.chise.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=64b644f3f4e02ef86d2fe4147f3b6852db0d0ab9;p=m17n%2Fm17n-db.git

Rewritten with new algorithm for Unicode 5.0.
---

diff --git a/FLT/ORYA-OTF.flt b/FLT/ORYA-OTF.flt
index ebd6ef5..5840962 100644
--- a/FLT/ORYA-OTF.flt
+++ b/FLT/ORYA-OTF.flt
@@ -30,289 +30,202 @@
       (font (nil nil unicode-bmp :otf=orya=rphf)))
 
 (category
- ;; C: consonant (excluding Y and R)
- ;; R: consonant RA (reph, below)
+ ;; C: consonant (except for R, B and Y)
+ ;; R: consonant RA
  ;; B: consonant (below)
- ;; Y: consonant YA, YYA (post)
+ ;; Y: consonant (post)
  ;; n: NUKTA
  ;; H: HALANT
- ;; m: MATRA (pre)
- ;; u: MATRA (above)
- ;; b: MATRA (below)
- ;; p: MATRA (post)
- ;; t: MATRA (two-part)
+ ;; m: vowel sign (pre)
+ ;; u: vowel sign (above)
+ ;; b: vowel sign (below)
+ ;; p: vowel sign (post)
+ ;; t: vowel sign (two-part)
  ;; A: vowel modifier (above)
  ;; a: vowel modifier (post)
  ;; V: independent vowel
  ;; N: ZWNJ (ZERO WIDTH NON-JOINER)
  ;; J: ZWJ (ZERO WIDTH JOINER)
- ;; E: ELSE
- ;;
- (0x200C	?N)			; ZWNJ
- (0x200D	?J)			; ZWJ
- (0x0664 0x0665	?E)			; DANDA, DOUBLE DANDA
- (0x0B00 0x0B7F	?E)			; ELSE
- (0x0B01	?A)			; SIGN CANDRABINDU (above)
- (0x0B02 0x0B03	?a)			; SIGN ANUSWAR, VISARGA (post)
- (0x0B05 0x0B0C	?V)			; LETTER A .. VOCALIC L
- (0x0B0F 0x0B10	?V)			; LETTER E .. AI
- (0x0B13 0x0B14	?V)			; LETTER O .. AU
+ ;; X: generic
+ ;; Z: internal use
+ (0x0B00 0x0B7F ?X)			; generic
+ (0x0B00	?Z)			; internal use
+ (0x0B01 0x0B03	?a)			; SIGN CANDRABINDU .. VISARGA
+ (0x0B05 0x0B14	?V)			; LETTER A .. AU
  (0x0B15 0x0B39	?C)			; LETTER KA .. HA
  (0x0B24	?B)			; LETTER TA
  (0x0B28	?B)			; LETTER NA
- (0x0B2C	?B)			; LETTER BA
- (0x0B2D	?B)			; LETTER BHA
- (0x0B2E	?B)			; LETTER MA
+ (0x0B2C 0x0B2E	?B)			; LETTER BA .. MA
  (0x0B2F	?Y)			; LETTER YA
  (0x0B30	?R)			; LETTER RA
- (0x0B32	?B)			; LETTER LA
+ (0x0B32 0x0B33	?B)			; LETTER LA .. LLA
  (0x0B33	?B)			; LETTER LLA
- (0x0B35	?B)			; LETTER VA
  (0x0B3C	?n)			; SIGN NUKTA
- (0x0B3E	?p)			; VOWEL SIGN AA (post)
- (0x0B3F	?u)			; VOWEL SIGN I (above)
- (0x0B40	?p)			; VOWEL SIGN II (post)
- (0x0B41 0x0B43	?b)			; VOWEL SIGN U, UU, R (below)
- (0x0B47	?m)			; VOWEL SIGN E (pre)
- (0x0B48 0x0B4C ?t)			; VOWEL SIGN AI, O, AU (two-part)
- (0x0B4D	?H)			; SIGN VIRAMA (HALANT)
+ (0x0B3E	?p)			; VOWEL SIGN AA
+ (0x0B3F	?u)			; VOWEL SIGN I
+ (0x0B40	?p)			; VOWEL SIGN II
+ (0x0B41 0x0B43	?b)			; VOWEL SIGN U .. VOCALIC R
+ (0x0B47	?m)			; VOWEL SIGN E
+ (0x0B48 0x0B4C ?t)			; VOWEL SIGN AI .. AU
+ (0x0B4D	?H)			; SIGN VIRAMA
  (0x0B56	?u)			; AI LENGTH MARK
  (0x0B57	?p)			; AU LENGTH MARK
- (0x0B5C 0x0B5D	?C)			; LETTER RRA, RHA
+ (0x0B5C 0x0B5D	?C)			; LETTER RRA .. RHA
  (0x0B5F	?Y)			; LETTER YYA
- (0x0B60 0x0B61	?V)			; LETTER VOCALIC RR, LL
+ (0x0B60 0x0B61	?V)			; LETTER VOCALIC RR .. LL
  (0x0B71	?C)			; LETTER WA
- (0x0B7E	?x)			; mark #1 (internal use)
- (0x0B7F	?y)			; mark #2 (internal use)
+ (0x0964 0x0965	?X)			; DANDA .. DOUBLE DANDA (U+0964, U+0965)
+ (0x200C	?N)			; ZWNJ
+ (0x200D	?J)			; ZWJ
  )
 
-;; Step 1 : Syllable identification.  Recognised syllables are quoted
-;; by the pseudo character, which is generated by the command "|" and
-;; has the category " " (space).
+;; Decompose two-part vowel signs.
+;; Move ZWJ before the consonant.
 (generator
  (0
   (cond
-   ;; Case F : Syllables containing an independent vowel.
-    ("(RH)?(V)(a)?(A)?"
-    < |
-    (2 =)
-    (1 = =)
-    (3 =)
-    (4 =)
-    | >)
-
-   ;; Case A-C are for those syllables that end with an explicit vowel
-   ;; mark and/or a vowel modifier.  They are divided into three cases
-   ;; for readability of the regular expressions.  The leading
-   ;; consonant-Halant repetition is analysed for reordering in the
-   ;; next step.  A two-part vowel, if any, is split for
-   ;; canonicalisation.
+   ((0x0B48)
+    0x0B47 0x0B56)
+   ((0x0B4B)
+    0x0B47 0x0B3E)
+   ((0x0B4C)
+    0x0B47 0x0B57)
+   ("(Cn?)(J)"
+    (2 =) (1 = *))
+   ("." =))
+  *))
 
-   ;; Case A : A syllable ending with a vowel modifier.
-   ("(RH)?(([CRBY]n?H[NJ]?)*([CRBY]n?))([mbup]*)(t)?([Aa])"
+;; Syllable identification and reordering.
+;; Do not apply 'rphf' if a syllable begins with ZWJ.
+(generator
+ (0
+  (cond
+   ;; A syllable with ZWJ and a pre-base vowel sign.
+   ;;1  23                    4  5   6   7
+   ("(J)(([CRBY]n?H)*[CRBY]n?)(m)(u)?(p)?(a)?"
+    < | (1 =) (4 =) (2 pre-below) (5 =) (2 post) (6 =) (7 =) | >)
+
+   ;; A syllable with ZWJ and a non-pre-base vowel sign.
+   ;;1  23                    45      6   7
+   ("(J)(([CRBY]n?H)*[CRBY]n?)(([bu])|(p))(a)?"
+    < | (1 =) (2 pre-below) (5 =) (2 post) (6 =) (7 =) | >)
+
+   ;; A syllable with ZWJ and a vowel modifier, but without vowel signs.
+   ;;1  23                    4
+   ("(J)(([CRBY]n?H)*[CRBY]n?)(a)"
+    < | (1 =) (2 pre-below) (2 post) (4 =) | >)
+
+   ;; Add a ZWNJ explicitly when a syllable ends with a halant.
+   ;;1  23                    4   5
+   ("(J)(([CRBY]n?H)*[CRBY]n?)(H)?(N)?"
+    < | (1 =) (2 pre-below) (4 = 0x200C) (2 post) | >)
+
+   ;; With a pre-base vowel sign, without a ZWJ.
+   ;;1    23                    4  5   6   7
+   ("(RH)?(([CRBY]n?H)*[CRBY]n?)(m)(u)?(p)?(a)?"
     < |
-    (1 = =)
-    (2 set-marks)
-    (5 = *)
-    (6 split)
-    (7 =)
+    (4 =) (2 pre-below) (5 =) (1 otf:orya=rphf) (2 post) (6 =) (7 =)
     | >)
 
-   ;; Case B : A syllable ending with a two-part vowel.
-   ("(RH)?(([CRBY]n?H[NJ]?)*([CRBY]n?))(t)"
-    < |
-    (1 = =)
-    (2 set-marks)
-    (5 split)
-    | >)
+   ;; With a non-pre-base vowel sign, without a ZWJ.
+   ;;1    23                    45      6   7
+   ("(RH)?(([CRBY]n?H)*[CRBY]n?)(([bu])|(p))(a)?"
+    < | (2 pre-below) (5 =) (1 otf:orya=rphf) (2 post) (6 =) (7 =) | >)
 
-   ;; Case C : A syllable ending with other vowel(s).  Note that a
-   ;; two-part vowel may be expressed with two vowel marks for
-   ;; backward compatibility.
-   ("(RH)?(([CRBY]n?H[NJ]?)*([CRBY]n?))([mbup]+)"
-    < |
-    (1 = =)
-    (2 set-marks)
-    (5 = *)
-    | >)
+   ;; With a vowel modifier, but without vowel signs or a ZWJ.
+   ;;1    23                    4
+   ("(RH)?(([CRBY]n?H)*[CRBY]n?)(a)"
+    < | (2 pre-below) (1 otf:orya=rphf) (2 post) (4 =) | >)
 
-   ;; Case E : No explicit vowel nor modifier.  If the syllable ends
-   ;; with a consonant, analyse it for reordering in the next step.
-   ;; Otherwise, just identify the syllable without changing anything.
-   ;;1    23                         4
-   ("(RH)?(([CRBY]n?H[NJ]?)*[CRBY]n?)(HN|HJ|H)?"
-    < |
-    (1 = =)
-    (2 set-marks)
-    (4 = *)
-    | >)
+   ;; Add a ZWNJ explicitly when a syllable ends with a halant.
+   ;;1    23                    4   5
+   ("(RH)?(([CRBY]n?H)*[CRBY]n?)(H)?(N)?"
+    < | (2 pre-below) (1 otf:orya=rphf) (4 = 0x200C) (2 post) | >)
+
+   ;; A syllable starting with an independent vowel.
+   ("Va?"
+    < | = * | >)
 
    ("." =))
   *)
 
- ;; Set mark #1 (x) at the position where below consonants begin, and
- ;; mark #2 (y) at the position to which below and above signs will be
- ;; moved.
- (set-marks
+ ;; Move a halant after the base consonant to the end.
+ ;; Fill the resulting gap with a special mark.
+ ;; Remove post-base parts.
+ (pre-below
   (cond
-   ;; Ending with Y.
-   ;;1        2            3  45        6
-   ("([CRBY]n?(H[NJ]?Cn?)*)(H)(([RB]H)*)(Y)$"
-    (1 = *)				; prebase & base
-    0x0B7E				; below begin
-    (4 = *)				; below consonants
-    0x0B7F				; below end
-    (6 =)				; YA
-    (3 =))				; moved HALANT
-   ;; Ending with R or B.
-   ;;1        2            3  45
-   ("([CRBY]n?(H[NJ]?Cn?)*)(H)(([RB]H)*[RB])$"
-    (1 = *)				; prebase & base
-    0x0B7E				; below begin
-    (4 = *)				; below consonants 
-    (3 =)				; moved HALANT
-    0x0B7F)				; below end
-   (".+"
-    = *
-    0x0B7E				; below begin
-    0x0B7F)))				; below end
-
- ;; Split two-part dependent vowel signs for canonicalisation.
- (split
+   ("([CRBYnH]*[CYn])H([RBH]+)[YH]+$"
+    (1 = *) 0x0B00 (2 = *))
+   ("([CRBYnH]*[CYn])(H)([RBH]+)$"
+    (1 = *) 0x0B00 (3 = *) (2 =))
+   ("([CRBYnH]*[Cn])[YH]*$"
+    (1 = *) 0x0B00)
+   ("([RB]n?)H([RBH]*)[YH]+$"
+    (1 = *) 0x0B00 (2 = *))
+   ("([RB]n?)(H)([RBH]*)$"
+    (1 = *) 0x0B00 (3 = *) (2 =))
+   ("([RBY]n?)[YH]*$"
+    (1 = *) 0x0B00)))
+
+ ;; Extract post-base parts and add a halant at the end.
+ ;; Produce nothing if there are no post-base parts.
+ (post
   (cond
-   ((0x0B48)	0x0B47 0x0B56)
-   ((0x0B4B)	0x0B47 0x0B3E)
-   ((0x0B4C)	0x0B47 0x0B57)))
+   ("[CRBYnH]*[CRBn]H([YH]+)$"
+    (1 = *) 0x0B4D)
+   ("Yn?H(YH)+$"
+    (1 = *) 0x0B4D)
+   (".+"
+    )))
  )
 
-;; Step 2 : Move Reph and Matra if necessary.  From now on, we care
-;; only for those syllables that have been identified in Step 1.
+;; Apply language forms to the segments concerned.
 (generator
  (0
   (cond
-   ;; Special case: a single consonant and a Halant.
-   (" (.)xy(H[NJ]?) "
-    |
-    (1 =)
-    (2 = *)
-    |)
-
-   ;; This is the most generic pattern.  It follows Case A-C and a
-   ;; part of Case E in Step 1.  Now Mark #1 is used to indicate the
-   ;; critical part that requires pre-base substitution in the
-   ;; following steps.
-
-   ;; 1    2         3        4    5   6   7   8   9   10  11
-   (" (RH)?([^ xy]+)x([^ y]*)y(YH)?(m)?(b)?(u)?(p)?(A)?(a)?(HN|HJ|H)? "
-    |
-    (5 =)				; [Mpre]
-    ;; We can safely perform Nukta composition here because it does
-    ;; not affect surrounding letters in the syllable.  The Akhand
-    ;; ligature operation is also applied here, before applying the
-    ;; half form operation because the Utkal font generates Akhand
-    ;; ligatures directly from the "C H C" sequence, not via the half
-    ;; form.
-    0x0B7E				; begin Cpre & Cbase
-    (2 otf:orya=nukt,akhn+)		; {Cpre + H} + Cbase
-    0x0B7E				; end Cpre & Cbase
-    (3 otf:orya=blwf+)			; {Cbelow + H}
-    (6 =)				; [Mbelow]
-    (7 =)				; [Mabove]
-    (1 otf:orya=rphf+)			; [Reph]
-    (4 otf:orya=pstf+)			; [Cpost + H]
-    (8 =)				; [Mpost]
-    (9 =)				; [VMabove]
-    (10 =)				; [VMpost]
-    (11 = *)				; optional HALANT
-    |)
+   ;; If a syllable contains a ZWNJ, render the preceding halant explicitly.
+   (" ([^Z]+)(Z)([^N]*)(HN)([^ ]*) "
+    | (1 otf:orya=nukt,akhn,half+) (2 =) (3 otf:orya=blwf+) (4 = =)
+    (5 otf:orya=pstf+) |)
 
-   ;; Syllables that begin with an independent vowel (following up
-   ;; Step 1, Case F).  Syllables of this type do not require further
-   ;; modification.
-   (" (V)(RH)(.*) "
-    |
-    (1 =)
-    (2 otf:orya=rphf+)
-    (3 = *)
-    |)
+   (" (J?m?)([^Z]+)(Z)([^ ]*) "
+    | (1 = *) (2 otf:orya=nukt,akhn,half+) (3 =) (4 otf:orya=blwf,pstf+) |)
 
    ("." =))
   *))
 
-;; Step 3 : Now only those syllables that contain the pseudo character
-;; x require pre-base substition.  Unlike the Mukti font for Bengali,
-;; the Utkal font can produce the ligature for "C1 H C2" from
-;; "C1halant" and "C2".  If such a ligature is not available, we get a
-;; sequence consisting of "C1halant" and "C2", which is satisfactory.
-
+;; Apply 'pres' to get pre-base conjuncts.
 (generator
  (0
   (cond
-   (" (.H)J "
-    |
-    (1 otf:orya=half+)
-    |)
-   (" (.H)N? "
-    |
-    (1 otf:orya=haln+)
-    |)
-   (" ([^x ]?x)([^x ]*)(x[^ ]*) "
-    |
-    (1 = *)
-    (2 pres)
-    (3 = *)
-    |)
+   (" (J?m?)([^Z]+)(Z)([^ ]*) "
+    | (1 = *) (2 otf:orya=pres+) (3 =) (4 = *) |)
    ("." =))
-  *)
-
- (pres
-  (cond
-   ("([^NJ]*)(.H)J(.*)"
-    (1 otf:orya=haln,pres+)
-    (2 otf:orya=half+)
-    (3 pres))
-   ("([^N]*)(H)N(.*)"
-    (1 otf:orya=haln,pres+)
-    (2 =)
-    (3 pres))
-   (".*"
-    otf:orya=haln,pres+)))
-    
- )
-
-;; Step 4 : Mpre/Cpre reordering.  If the pre-base substitution in
-;; the previous step results in more than one glyph, and there is an
-;; Mpre in this syllable, then move the Mpre before the Cbase.
-;; i.e. [Mpre]{Kh}Kf... -> {Kh}[Mpre]Kf...
+  *))
 
+;; When the number of glyphs between a pre-base vowel sign and the
+;; post-base mark is more than one, move the pre-base vowel sign
+;; before the base glyph.
 (generator
  (0
   (cond
-   (" ([^x ])x([^x ]+)([^x ])x([^x ]*) "
-    |
-    (2 = *)
-    (1 =)
-    (3 =)
-    (4 = *)
-    |)
-   (" ([^x ])?x([^x ]*)x([^ ]*) "
-    |
-    (1 =)
-    (2 = *)
-    (3 = *)
-    |)
+   (" (J)?(m)([^Z]+)([^Z])Z([^N ]*)N?([^ ]*) "
+    | (1 =) (3 = *) (2 =) (4 =) (5 = *) (6 = *)|)
+   (" ([^Z]+)Z([^N ]*)N?([^ ]*) "
+    | (1 = *) (2 = *) (3 = *) |)
    ("." =))
   *))
 
-;; Step 5 : Substitutions & positioning.
-
+;; Apply other features.
+;; Do not apply 'vatu' and 'blws' if there is a ZWJ.
+;; The 'pres' feature is applied again for the pre-base vowel sign.
 (generator
  (0
   (cond
-   (" ([^ ]*) "
-    ;; FIXME : The pres below is for the TTA ligature in the Utkal
-    ;; font.  It should be removed once the font is updated.
-    (1 otf:orya=vatu,abvs,blws,psts,pres))
+   (" J([^ ]+) "			; syllable starting with ZWJ: no 'vatu', no 'blws'
+    (1 otf:orya=pres,abvs,psts,haln))
+   (" ([^ ]+) "				; any other identified syllable
+    (1 otf:orya=vatu,pres,abvs,blws,psts,haln))
    ("."
     [ otf:orya=+ ]))
   *))