# chise/parser.rb
#
# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.

   3 module CHISE
   4   class CharacterParser
   5     def parse(c) # parse a value and return a number (MCS)
   6       raise "c is nil" if c.nil?
   7
   8       if c.kind_of?(String)
   9         if /\A\?/ =~ c
  10           c = c.sub(/\A\?/, "") # remove "?" in the head
  11           u4 = c.u8tou32 # translate from UTF-8 to UTF-32
  12           return u4.u32to_i # translate UTF-32 to UCS number
  13         end
  14
  15         return parse_er(c) if is_er?(c) # ER?
  16
  17         return c.to_i if /^\d+$/ =~ c # only numbers?
  18
  19         raise "unknown format"
  20       end
  21
  22       if c.kind_of?(Numeric)
  23         c = 0x80000000 + c if c < 0 # negative value
  24         return c.to_i
  25       end
  26
  27       raise "unknown object"
  28     end
  29
  30     PART    = "&([-+0-9A-Za-z#]+);"
  31     ALL     = "\\A#{PART}\\Z"
  32     PART_RE = Regexp.new(PART)
  33     ALL_RE  = Regexp.new(ALL)
  34
  35     def contain_er?(s)  (PART_RE =~ s) != nil;  end
  36     def is_er?(s)       (ALL_RE =~ s) != nil;   end
  37
  38     # the order is important.  The primary charset should be selectable.
  39     CODESYS_TABLE = [
  40       %w( =jis-x0208-1990       J90- 4 X),
  41       %w( =jis-x0208-1983       J83- 4 X),
  42       %w( =jis-x0208-1978       J78- 4 X),
  43       %w( =jis-x0208            J90- 4 X), # \8cp\8f³\82Ì\83A\83h\83z\83b\83N\82È\8eÀ\91\95
  44       %w( =jis-x0208            J83- 4 X), # \8cp\8f³\82Ì\83A\83h\83z\83b\83N\82È\8eÀ\91\95
  45       %w( =jis-x0208            J78- 4 X), # \8cp\8f³\82Ì\83A\83h\83z\83b\83N\82È\8eÀ\91\95
  46       %w( =jis-x0213-1-2000     JX1- 4 X),
  47       %w( =jis-x0213-2-2000     JX2- 4 X),
  48       %w( =jis-x0212            JSP- 4 X),
  49       %w( =big5-cdp             CDP- 4 X),
  50       %w( =cns11643-1           C1-  4 X),
  51       %w( =cns11643-2           C2-  4 X),
  52       %w( =cns11643-3           C3-  4 X),
  53       %w( =cns11643-4           C4-  4 X),
  54       %w( =cns11643-5           C5-  4 X),
  55       %w( =cns11643-6           C6-  4 X),
  56       %w( =cns11643-7           C7-  4 X),
  57       %w( =ks-x1001             K0-  4 X),
  58       %w( =daikanwa             M-   5 d),
  59       %w( =cbeta                CB   5 d),
  60       %w( =gt                   GT-  5 d),
  61       %w( =gt-k                 GT-K 5 d),
  62     ]
  63     PRIVATE_USE_AREA = 0xe000
  64
  65     def parse_er(s) # parse a Entity Reference and return a number (MCS)
  66       raise "wrong ER." unless ALL_RE =~ s # don't use is_er? for getting $1.
  67
  68       s = $1 # extract the part of ER
  69
  70       return $1.hex if s =~ /\AMCS-([0-9A-Fa-f]+)\Z/ # MCS. It's a mystery.
  71
  72       return $1.hex if s =~ /\AU[-+]?([0-9A-Fa-f]+)\Z/ ||
  73           s =~ /\A#x([0-9A-Fa-f]+)\Z/ # Unicode code point in Hex.
  74
  75       return $1.to_i if s =~ /\A#([0-9]+)\Z/ # Unicode code point in Decimal.
  76
  77       if s =~ /\Amy-([0-9]+)\Z/ # my own code point. It's a secret.
  78         return PRIVATE_USE_AREA + $1.to_i # private use area of Unicode.
  79       end
  80
  81       if s =~ /\AI-/ # I- stands for Isolated character. It's a wonder.
  82         s = s.sub(/\AI-/, "")
  83       end
  84
  85       CODESYS_TABLE.each {|codesys, er_prefix, keta, numtype|
  86         if numtype == "d"
  87           nre = '\d'
  88         elsif numtype == "X"
  89           nre = "[0-9A-Fa-f]"
  90         else
  91           next
  92         end
  93
  94         re = "\\A#{er_prefix}(#{nre}{#{keta},#{keta}})\\Z"
  95         next unless Regexp.new(re) =~ s
  96
  97         codestr = $1
  98         if numtype == "d"
  99           code = codestr.to_i
 100         else
 101           code = codestr.hex
 102         end
 103
 104         u8 = CodesysDB.instance.get(codesys, code)
 105         next if u8.nil?
 106
 107         num = parse("?"+u8)
 108         next if num.nil?
 109
 110         return num
 111       }
 112
 113       raise "unknown Entity Reference"
 114     end
 115
 116   end
 117 end