;; BENG-OTF.flt -- Font Layout Table for Bengali OpenType font
;; Copyright (C) 2004
;;   National Institute of Advanced Industrial Science and Technology (AIST)
;;   Registration Number H15PRO112

;; This file is part of the m17n database; a sub-part of the m17n
;; library.

;; The m17n library is free software; you can redistribute it and/or
;; modify it under the terms of the GNU Lesser General Public License
;; as published by the Free Software Foundation; either version 2.1 of
;; the License, or (at your option) any later version.

;; The m17n library is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; Lesser General Public License for more details.

;; You should have received a copy of the GNU Lesser General Public
;; License along with the m17n library; if not, write to the Free
;; Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
;; 02111-1307, USA.

;;; <li> BENG-OTF.flt
;;;
;;; For Bengali OpenType fonts to draw the Bengali script.  Tested with
;;; MuktiNarrow.ttf <http://www.nongnu.org/freebangfont/index.html>
;;; and
;;; LikhanNormal.otf <http://www.stat.wisc.edu/~deepayan/Bengali/WebPage/Font/fonts.html>

(category
 ;; Category table used by the generators of Step 1 and Step 2.  Each
 ;; entry assigns a one-letter category to a code point or to a code
 ;; point range; the regular expressions in the rules below are
 ;; written in terms of these letters.
 ;;
 ;; C: consonant (excluding B, Y and R)
 ;; B: consonant BA (below)
 ;; Y: consonant YA (post)
 ;; R: consonant RA (reph, below)
 ;; n: NUKTA
 ;; H: HALANT
 ;; m: MATRA (pre)
 ;; b: MATRA (below)
 ;; p: MATRA (post)
 ;; t: MATRA (two-part)
 ;; A: vowel modifier (above)
 ;; a: vowel modifier (post)
 ;; V: independent vowel
 ;; N: ZWNJ (ZERO WIDTH NON-JOINER)
 ;; J: ZWJ (ZERO WIDTH JOINER)
 ;; E: ELSE
 ;; x: mark #1 (internal use, 0x09FE)
 ;; y: mark #2 (internal use, 0x09FF)
 ;;
 ;; NOTE(review): the blanket "ELSE" range comes first and the more
 ;; specific entries that follow fall inside it, so the table
 ;; presumably relies on later entries taking precedence over earlier
 ;; ones -- keep this ordering when adding entries.
 (0x200C	?N)			; ZWNJ
 (0x200D	?J)			; ZWJ
 (0x0964 0x0965	?E)			; DANDA, DOUBLE DANDA
 (0x0980 0x09FF	?E)			; ELSE
 (0x0981	?A)			; SIGN CANDRABINDU (above)
 (0x0982 0x0983	?a)			; SIGN ANUSWAR, VISARGA (post)
 (0x0985 0x098C	?V)			; LETTER A .. VOCALIC L
 (0x098F 0x0990	?V)			; LETTER E .. AI
 (0x0993 0x0994	?V)			; LETTER O .. AU
 (0x0995 0x09B9	?C)			; LETTER KA .. HA
 ;; BA, YA and RA lie inside the KA .. HA range above and are singled
 ;; out here with their own categories.
 (0x09AC	?B)			; LETTER BA
 (0x09AF	?Y)			; LETTER YA
 (0x09B0	?R)			; LETTER RA
 (0x09BC	?n)			; SIGN NUKTA
 (0x09BE	?p)			; VOWEL SIGN AA (post)
 (0x09BF	?m)			; VOWEL SIGN I (pre)
 (0x09C0	?p)			; VOWEL SIGN II (post)
 (0x09C1 0x09C4	?b)			; VOWEL SIGN U, UU, R, RR (below)
 (0x09C7 0x09C8	?m)			; VOWEL SIGN E, AI (pre)
 (0x09CB 0x09CC ?t)			; VOWEL SIGN O, AU (two-part)
 (0x09CD	?H)			; SIGN VIRAMA (HASANT)
 (0x09CE	?C)			; LETTER KHANDA TA
 (0x09D7	?p)			; AU LENGTH MARK
 (0x09DC 0x09DD	?C)			; LETTER RRA, RHA
 (0x09DF	?C)			; LETTER YYA
 (0x09E0 0x09E1	?V)			; LETTER VOCALIC RR, LL
 (0x09E2 0x09E3	?b)			; VOWEL SIGN L .. LL (below)
 (0x09F0 0x09F1	?C)			; LETTER RR', RR'' (assamese)
 ;; The last two code points of the block are borrowed as internal
 ;; position marks; they are inserted by the set-marks stage of Step 1.
 (0x09FE	?x)			; mark #1 (internal use)
 (0x09FF	?y)			; mark #2 (internal use)
 )

;; Step 1 : Syllable identification.  Recognised syllables are quoted
;; by the pseudo character, which is generated by the command "|" and
;; has the category " " (space).
(generator
 ;; Stage 0 is the entry point of this step: the rules of the "cond"
 ;; are tried in the order written and the stage is repeated ("*")
 ;; over the input.  The order of the cases matters: Cases A-C require
 ;; an explicit vowel sign or modifier and are therefore tried before
 ;; the catch-all Case E, and the ZWNJ form of Ya-phalaa (Case D) is
 ;; tried before Case E as well.
 (0
  (cond
   ;; Case A-C are for those syllables that end with an explicit vowel
   ;; mark and/or a vowel modifier.  They are divided into three cases
   ;; for the readability of regular expression.  The leading
   ;; consonant-Hasant repetition is analysed for reordering in the
   ;; next step.  Two-part vowel, if any, is split for
   ;; canonicalisation.
   ;;
   ;; In all of Cases A, B, C and E, group 1 is an optional leading
   ;; RA + HASANT that is copied through as is, and group 2 (the
   ;; consonant cluster) is handed to the set-marks stage below.

   ;; Case A : A syllable ending with a vowel modifier.
   ;;1    23                4          5       6   7
   ("(RH)?(([CRBY]n?H[NJ]?)*([CRBY]n?))([mbp]*)(t)?([Aa])"
    < |
    (1 = =)
    (2 set-marks)
    (5 = *)
    (6 split)
    (7 =)
    | >)

   ;; Case B : A syllable ending with a two-part vowel.
   ;;1    23                4          5
   ("(RH)?(([CRBY]n?H[NJ]?)*([CRBY]n?))(t)"
    < |
    (1 = =)
    (2 set-marks)
    (5 split)
    | >)

   ;; Case C : A syllable ending with other vowel.  Note that a
   ;; two-part vowel may be expressed with two vowel marks for
   ;; backward compatibility.
   ;;1    23                4          5
   ("(RH)?(([CRBY]n?H[NJ]?)*([CRBY]n?))([mbp]+)"
    < |
    (1 = =)
    (2 set-marks)
    (5 = *)
    | >)

   ;; Case D : Ya-phalaa.  Reorder H and Y for the next step.
   ;; The web page "Unicode FAQ for Indic Scripts and Languages"
   ;; <http://www.unicode.org/faq/indic.html> says "it should be
   ;; permissible for the Ya-phalaa to be consistently formed by
   ;; 'ZWNJ + VIRAMA + YA'".
   ;; Group 1 (consonant, optional NUKTA, ZWNJ) is emitted first, then
   ;; group 3 (YA), then group 2 (HASANT), i.e. H and Y are swapped.
   ("([CRBY]n?N)(H)(Y)"
    < |
    (1 = *)
    (3 =)
    (2 =)
    | >)

   ;; Case E : No explicit vowel nor modifier.  If the syllable ends
   ;; with a consonant, analyse it for reordering in the next step.
   ;; Otherwise, just identify the syllable without changing anything.
   ;;1    23                         4
   ("(RH)?(([CRBY]n?H[NJ]?)*[CRBY]n?)(HN|HJ|H)?"
    < |
    (1 = =)
    (2 set-marks)
    (4 = *)
    | >)

   ;; Case F : Syllables that begin with an independent vowel.  An
   ;; optional HYp sequence appears when this syllable represents the
   ;; sound "a" in English "bat" (see the FAQ above).  If it appears,
   ;; we reorder the H and Y for the next step.
   ;; The nested rule for group 2 emits YA (0x09AF) and HASANT (0x09CD)
   ;; in that order, followed by the post-base vowel sign it captured.
   ("(V)(HYp)?([aA])?"
    < | (1 =) (2 ("HY(p)" 0x09AF 0x09CD (1 =))) (3 =) | >)

   ;; Anything that did not start a recognised syllable is kept as is.
   ("." =))
  *)

 ;; Set mark #1 (x) at the position where below consonants begin, and
 ;; mark #2 (y) at the position to which below and above signs will be
 ;; moved.
 ;; This stage is invoked only on group 2 of Cases A, B, C and E above,
 ;; i.e. on the consonant cluster of a syllable.
 (set-marks
  (cond
   ;; Ending with Y.
   ;; The HASANT that preceded the below consonants (group 3) is moved
   ;; to the very end, after YA, so the output ends in "Y H".
   ;;1        2            3  45        6
   ("([CRBY]n?(H[NJ]?Cn?)*)(H)(([RB]H)*)(Y)"
    (1 = *)				; prebase & base
    0x09FE				; mark #1
    (4 = *)				; below consonants
    0x09FF				; mark #2
    (6 =)				; YA
    (3 =))				; moved HASANT
   ;; Ending with R or B.
   ;; Here the moved HASANT (group 3) is placed after the below
   ;; consonants but before mark #2.
   ;;1        2            3  45
   ("([CRBY]n?(H[NJ]?Cn?)*)(H)(([RB]H)*[RB])"
    (1 = *)				; prebase & base
    0x09FE				; mark #1
    (4 = *)				; below consonants
    (3 =)				; moved HASANT
    0x09FF)				; mark #2
   ;; No below or post consonant: the whole cluster is prebase & base
   ;; and the two marks are simply appended next to each other.
   (".+"
    = *
    0x09FE				; mark #1
    0x09FF)))				; mark #2

 ;; Split two-part dependent vowel signs for canonicalisation.
 ;; Each two-part sign is replaced by VOWEL SIGN E (0x09C7) followed by
 ;; its second component.
 (split
  (cond
   ((0x09CB)	0x09C7 0x09BE)		; O  -> E + AA
   ((0x09CC)	0x09C7 0x09D7)))	; AU -> E + AU LENGTH MARK
 )

;; Step 2 : Move Reph and Matra if necessary.  From now on, we care
;; only for those syllables that have been identified in Step 1.
(generator
 ;; Stage 0: the rules are tried in the order written and the stage is
 ;; repeated ("*") over the input.  Every syllable pattern is anchored
 ;; between two boundary pseudo characters (category " ") produced in
 ;; Step 1, and every wildcard inside a pattern excludes " " so that a
 ;; rule can never extend beyond the syllable it started in.
 (0
  (cond
   ;; Special case: a single consonant and a Halant.
   ;; Marks x and y are adjacent, i.e. there is no below consonant.
   ;; The consonant and the HASANT (with its optional ZWNJ/ZWJ) are
   ;; enclosed in a pair of mark #1 so that Step 3 treats them as a
   ;; pre-base section; mark #2 is dropped.
   (" (.)xy(H[NJ]?) "
    |
    0x09FE
    (1 =)
    (2 = *)
    0x09FE
    |)

   ;; This is the most generic pattern.  It follows Cases A, B, C and
   ;; E in Step 1.  Now Mark #1 is used to indicate the critical part
   ;; that requires pre-base substitution in the following steps.
   ;;
   ;; The groups are emitted in a different order from the one in
   ;; which they are matched: the pre-base matra (5) moves to the
   ;; front, and the reph (1) moves behind the base, the below
   ;; consonants and the below matra.

   ;; 1    2         3        4    5   6   7   8   9   10
   (" (RH)?([^ xy]+)x([^ y]*)y(YH)?(m)?(b)?(p)?(A)?(a)?(HN|HJ|H)? "
    |
    (5 =)				; [Mpre]
    ;; Actually, the nukt feature is not necessary for Bengali because
    ;; all the necessary Nukta forms are precomposed in the Unicode
    ;; standard.  Even if a Nukta consonant is given in the form of
    ;; the combination of the base consonant and a Nukta sign, we can
    ;; safely perform the composition here because it does not affect
    ;; surrounding letters in the syllable.  The Akhand ligature
    ;; operation is also applied here, before applying the half form
    ;; operation because the Mukti font generates Akhand ligatures
    ;; directly from the "C H C" sequence, not via the half form.
    0x09FE				; begin Cpre & Cbase
    (2 otf:beng=nukt,akhn)		; {Cpre + H} + Cbase
    0x09FE				; end Cpre & Cbase
    (3 otf:beng=blwf)			; {Cbelow + H}
    (6 =)				; [Mbelow]
    (1 otf:beng=rphf)			; [Reph]
    (8 =)				; [VMabove]
    (4 otf:beng=pstf)			; [Cpost + H]
    (7 =)				; [Mpost]
    (9 =)				; [VMpost]
    (10 = *)				; optional HASANT
    |)

   ;; Syllables that begin with an independent vowel (following up
   ;; Step 1, Case F).  If a YH sequence exists, it is changed to the
   ;; post-base form.  Syllables of this type do not require further
   ;; modification.
   ;;
   ;; The trailing group is written as "[^ ]*" rather than ".*", like
   ;; every other syllable pattern in this file.  An unbounded ".*"
   ;; could extend across the closing boundary into the following
   ;; syllables; those would then be copied through verbatim and never
   ;; reach the generic rule above.  Within one syllable the only
   ;; things Step 1, Case F can leave after YH are the post-base vowel
   ;; sign and an optional vowel modifier, neither of which has the
   ;; category " ", so nothing that used to match is lost.
   (" (V)(YH)([^ ]*) "
    |
    (1 =)
    (2 otf:beng=pstf)
    (3 = *)
    |)

   ;; Ya-phalaa (following up Step 1, Case D).  Remove N and change YH
   ;; to the post base form.  Syllables of this type do not require
   ;; further modification.
   ;; N sits outside both groups, which is how it gets removed.
   (" ([CBRY]n?)N(YH) "
    |
    (1 =)
    (2 otf:beng=pstf)
    |)

   ;; Anything else (including syllables that begin with an
   ;; independent vowel but carry no YH) is kept as is.
   ("." =))
  *))

;; Step 3 : Now only those syllables that contain the pseudo character
;; x require pre-base substitution.  This is the most complicated part
;; in this FLT.

;; If the sequence "C1 H C2" makes ligature L12, L12 replaces the
;; original sequence.

;; To test the availability of such a ligature, we try to generate it
;; using the pre-base substitute feature, then see whether succeeded
;; or not.  In the case of failure, the pre-base feature does not
;; change the original sequence.

;; To create a ligature, the "C1 H" part must be first converted into
;; the half form of C1.  Creating the half form of a consonant always
;; succeeds.

;; ligature(half(C1,H),C2)
;; ==> ligature(C1half,C2)
;; ==> L12         ; success
;;     C1half C2   ; fail

;; If the ligature is not available, the "C1 H" part must be converted
;; into the _Halant_ (not half) form of C1.  However, there is no way
;; to reconvert C1half into C1halant nor to revert back to "C1 H".
;; Thus we duplicate the critical part in two different forms so that
;; we can select the appropriate one in the next step.  The pseudo
;; character x is used to indicate the boundaries.

;; ... C1 H C2 ...  ==>  ... x C1halant C2 x L12 x ...

;; If the length of the L12 part is one, ligature generation was
;; successful.  In this case we wipe out the duplicated C1halant and
;; C2.  Otherwise we remove L12.

;; In very few cases (I found only one in the Mukti font), the "C1 H"
;; part needs to be converted into C1halant (instead of C1half) to make
;; a ligature with C2.  So when we try to generate a ligature form, we
;; apply the GSUB features "half", "haln" and "pres" in this order.

(category
 ;; Category table for the pre-base substitution rules that follow.
 ;; It is much coarser than the first table: apart from the joiners,
 ;; HASANT and mark #1, only the consonants that take part in the
 ;; three-consonant ligature rule get a category of their own.
 ;; Everything else in the Bengali block is "E".
 ;;
 ;; H: HALANT
 ;; N: ZWNJ (ZERO WIDTH NON-JOINER)
 ;; J: ZWJ (ZERO WIDTH JOINER)
 ;; K: consonant KA
 ;; S: consonant SSA
 ;; M: consonant NNA or MA
 ;; x: mark #1 (internal use)
 ;; E: ELSE
 ;;
 ;; Note that mark #2 (0x09FF) and the placeholder 0x09FD used by the
 ;; rules below have no entry of their own here and thus fall under
 ;; the "ELSE" range.
 (0x200C	?N)			; ZWNJ
 (0x200D	?J)			; ZWJ
 (0x0964 0x0965	?E)			; DANDA, DOUBLE DANDA
 (0x0980 0x09FF	?E)			; ELSE
 (0x09CD	?H)			; SIGN VIRAMA (HASANT)
 (0x0995	?K)			; LETTER KA
 (0x09B7	?S)			; LETTER SSA
 (0x09A3	?M)			; LETTER NNA
 (0x09AE	?M)			; LETTER MA
 (0x09FE	?x)			; mark #1 (internal use)
 )

(generator
 ;; Stage 0: for each syllable whose pre-base section is delimited by
 ;; a pair of x, emit
 ;;     x <halant/half form + base> x <ligature attempt> x
 ;; so that the next step can pick one of the two representations.
 ;; Where no ligature is to be attempted, two 0x09FD placeholders are
 ;; emitted as the "ligature attempt"; since that section is then two
 ;; glyphs long, the next step takes it for a failed ligature and
 ;; discards it.
 (0
  (cond

   ;; One pre-base and base.
   ;; Group 5 is an optional HASANT following the base; it is emitted
   ;; after the third x, outside both representations.
   ;; 1        23   4       5    6
   (" ([^x ]*)x((.H)([^NJ]))(H)?x([^ ]*) "
    |
    (1 = *)
    0x09FE				; x
    (3 otf:beng=haln)			; C1halant
    (4 =)				; C2
    0x09FE				; x
    (2 otf:beng=half,haln,pres)		; ligature result
    0x09FE				; x
    (5 =)
    (6 = *)
    |)

   ;; One pre-base with ZWJ.  According to the Unicode FAQ, the half
   ;; form is forced in this case.  So we pretend that ligature
   ;; generation has failed.
   ;; J sits outside the groups and is therefore not emitted.
   (" ([^x ]*)x(.H)J(.)?x([^ ]*) "
    |
    (1 = *)
    0x09FE				; x
    (2 otf:beng=half)			; C1half
    (3 =)				; C2
    0x09FE				; x
    0x09FD				; pseudo result
    0x09FD				; pseudo result
    0x09FE				; x
    (4 = *)
    |)

   ;; One pre-base possibly with ZWNJ.  Similar to above, except that
   ;; the halant form is used instead of the half form.  N, if any, is
   ;; not emitted.
   (" ([^x ]*)x(.H)N?(.)?x([^ ]*) "
    |
    (1 = *)
    0x09FE				; x
    (2 otf:beng=haln)			; C1halant
    (3 =)				; C2
    0x09FE				; x
    0x09FD				; pseudo result
    0x09FD				; pseudo result
    0x09FE				; x
    (4 = *)
    |)

   ;; Standalone base.  There is nothing more to do.
   ;; Both x marks are dropped here, so the next step leaves this
   ;; syllable alone.
   (" ([^x ]*)x(.)x([^ ]*) "
    |
    (1 = *)
    (2 =)
    (3 = *)
    |)

   ;; KA-SSA-NNA and KA-SSA-MA are the only pre-base ligatures that
   ;; consist of three consonants.
   ;; 1        23   4   5   6    7
   (" ([^x ]*)x((KH)(SH)(M))(H)?x([^ ]*) "
    |
    (1 = *)
    0x09FE				; x
    (3 otf:beng=haln)			; KAhalant
    (4 otf:beng=haln)			; SSAhalant
    (5 =)				; NNA or MA
    0x09FE				; x
    (2 otf:beng=half,haln,pres)		; ligature result
    0x09FE				; x
    (6 =)
    (7 = *)
    |)

   ;; Two or more pre-bases plus base.  Give up.  Convert all
   ;; pre-bases into halant form.
   ;; 1        23             4       5
   (" ([^x ]*)x(([^x]H[JN]?)+)([^x])?x([^ ]*) "
    |
    (1 = *)
    0x09FE				; x
    (2 force-haln)			; halant forms
    (4 =)				; full form
    0x09FE				; x
    0x09FD				; pseudo result
    0x09FD				; pseudo result
    0x09FE				; x
    (5 = *)
    |)

   ;; Anything else is kept as is.
   ("." =))
  *)

 ;; This is to remove ZWNJ and ZWJ.  The half-form-force-effect of ZWJ
 ;; is ignored.  Sorry.
 ;; The first rule applies haln to everything before the first joiner,
 ;; drops that joiner, and recurses on the remainder; the second rule
 ;; handles a remainder that contains no joiner.
 (force-haln
  (cond
   ("([^JN]*)[JN](.*)"
    (1 otf:beng=haln)
    (2 force-haln))
   (".+"
    otf:beng=haln)))
 )

;; Step 4 : Select the appropriate representation.  Only those
;; syllables that contain the virtual character x require
;; modification.
(generator
 ;; Stage 0: a syllable prepared by the previous step has the shape
 ;;     <head> x <halant/half form + base> x <ligature attempt> x <tail>
 ;; Keep exactly one of the two middle sections and drop all three x.
 ;; The first rule must stay ahead of the second, which would
 ;; otherwise also accept a one-glyph ligature section.
 (0
  (cond
   ;; Only one glyph in the ligature section (between the second and
   ;; the third x).  It means a ligature was successfully generated.
   ;; C1halant and C2 (between the first and second x) are removed.
   (" ([^x ]*)x[^x]+x(.)x([^ ]*) "
    |
    (1 = *)
    (2 =)
    (3 = *)
    |)

   ;; Otherwise halant and base forms are used.  The failed ligature
   ;; is removed.
   (" ([^x ]*)x([^x]+)x[^x]+x([^ ]*) "
    |
    (1 = *)
    (2 = *)
    (3 = *)
    |)

   ;; No need to care about the other cases.
   ("." =))
  *))

;; Step 5 : Select appropriate glyph variants for fine adjustments.
;; Now the syllable boundary marks are removed so that the final step
;; can find word boundaries.
(generator
 ;; Stage 0, repeated ("*") over the input.
 (0
  (cond
   ;; A whole syllable: apply the blws, abvs, psts and vatu features to
   ;; its contents.  The boundary pseudo characters on both sides are
   ;; matched but lie outside group 1 and are not re-emitted, which is
   ;; how the syllable boundary marks disappear in this step.
   (" ([^ ]+) "
    (1 otf:beng=blws,abvs,psts,vatu))
   ;; Any glyph that is not part of a delimited syllable.
   ;; NOTE(review): the exact effect of "[ otf:beng=+ ]" is defined by
   ;; the m17n FLT format and cannot be told from this file alone --
   ;; check the format documentation before changing it.
   ("."
    [ otf:beng=+ ] ))
  *)
 )

;; Step 6 : Word initial substitute.  As the syllable boundaries have
;; been eliminated in the previous step, this rule is applied to a run
;; of Bengali glyphs, i.e. word by word.  We finally apply the init
;; feature to the word initial glyphs to get the final result.
(generator
 ;; Stage 0 consists of a single rule and is not repeated: group 1 is
 ;; the first glyph of the run and receives the init feature; group 2
 ;; is the rest of the run and is copied as is.  Unlike the syllable
 ;; rules of the earlier steps, the ".*" here is deliberately
 ;; unbounded so that it covers the remainder of the run.
 (0
  ("(.)(.*)"
   (1 otf:beng=init)
   (2 = *))))

;; Local Variables:
;; mode: emacs-lisp
;; End: