From f87bb7feacfe97e675fd65ef0b3da18a17c52930 Mon Sep 17 00:00:00 2001 From: akr Date: Sun, 16 Aug 1998 11:40:53 +0000 Subject: [PATCH] * FLIM-ELS (flim-modules): Add `closure', `natset', `digraph', `automata', `rx', `lex', `lr-driver', `ew-util', `ew-line', `ew-quote', `ew-unit', `ew-data', `ew-parse', `ew-scan-s', `ew-scan-m', `ew-scan-u' and `ew-dec'. * eword-decode.el: Require 'ew-dec. (eword-decode-header): Use `ew-decode-field'. (eword-decode-and-unfold-structure): Ditto. (eword-decode-structured-field-body): Ditto. (eword-decode-unstructured-field-body): Ditto. * automata.el, closure.el, digraph.el, ew-data.el, ew-dec.el, ew-line.el, parser, ew-quote.el, ew-scan-m.el, ew-scan-s.el, ew-scan-u.el, ew-unit.el, ew-util.el, lex.el, lr-driver.el, natset.el, rx.el: New files --- ChangeLog | 22 +++ FLIM-ELS | 18 +- automata.el | 499 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ closure.el | 44 +++++ digraph.el | 347 ++++++++++++++++++++++++++++++++++++++ ew-data.el | 138 +++++++++++++++ ew-dec.el | 414 +++++++++++++++++++++++++++++++++++++++++++++ ew-line.el | 174 +++++++++++++++++++ ew-quote.el | 199 ++++++++++++++++++++++ ew-scan-m.el | 152 +++++++++++++++++ ew-scan-s.el | 157 +++++++++++++++++ ew-scan-u.el | 56 +++++++ ew-unit.el | 90 ++++++++++ ew-util.el | 19 +++ eword-decode.el | 75 +++++---- lex.el | 196 ++++++++++++++++++++++ lr-driver.el | 72 ++++++++ natset.el | 242 +++++++++++++++++++++++++++ rx.el | 394 +++++++++++++++++++++++++++++++++++++++++++ 19 files changed, 3276 insertions(+), 32 deletions(-) create mode 100644 automata.el create mode 100644 closure.el create mode 100644 digraph.el create mode 100644 ew-data.el create mode 100644 ew-dec.el create mode 100644 ew-line.el create mode 100644 ew-quote.el create mode 100644 ew-scan-m.el create mode 100644 ew-scan-s.el create mode 100644 ew-scan-u.el create mode 100644 ew-unit.el create mode 100644 ew-util.el create mode 100644 lex.el create mode 100644 lr-driver.el create mode 100644 natset.el create mode 100644 rx.el diff --git a/ChangeLog b/ChangeLog index d685a24..cc51175 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,27 @@ 1998-08-16 Tanaka Akira + * FLIM-ELS (flim-modules): Add `closure', `natset', `digraph', + `automata', `rx', `lex', `lr-driver', `ew-util', `ew-line', + `ew-quote', `ew-unit', `ew-data', `ew-parse', `ew-scan-s', + `ew-scan-m', `ew-scan-u' and `ew-dec'. + + * eword-decode.el: Require 'ew-dec. + (eword-decode-header): Use `ew-decode-field'. + (eword-decode-and-unfold-structure): Ditto. + (eword-decode-structured-field-body): Ditto. + (eword-decode-unstructured-field-body): Ditto. + + * automata.el, closure.el, digraph.el, ew-data.el, ew-dec.el, + ew-line.el, parser, ew-quote.el, ew-scan-m.el, ew-scan-s.el, + ew-scan-u.el, ew-unit.el, ew-util.el, lex.el, lr-driver.el, + natset.el, rx.el: New files + +1998-08-16 Tanaka Akira + + * DOODLE: branched from flam-1_9_1 + +1998-08-16 Tanaka Akira + * mime-def.el (mime-library-version-string): bump up to 1.9.1. 1998-07-22 Tanaka Akira diff --git a/FLIM-ELS b/FLIM-ELS index 3e53c65..8763042 100644 --- a/FLIM-ELS +++ b/FLIM-ELS @@ -10,7 +10,23 @@ eword-decode eword-encode mime mime-parse mmbuffer mmcooked ;; mime-lib - mailcap)) + mailcap + closure + natset + digraph + automata rx lex + lr-driver + ew-util + ew-line + ew-quote + ew-unit + ew-data + ew-parse + ew-scan-s + ew-scan-m + ew-scan-u + ew-dec + )) (if (fboundp 'dynamic-link) (setq flim-modules (cons 'mel-dl flim-modules)) diff --git a/automata.el b/automata.el new file mode 100644 index 0000000..eac194f --- /dev/null +++ b/automata.el @@ -0,0 +1,499 @@ + +(require 'digraph) +(require 'natset) +(provide 'automata) + +(defvar automata-never-fail (make-symbol "automata-never-fail")) +(defvar automata-state-var (make-symbol "automata-state")) + +(defmacro automata (in-var start-tag &rest clauses) + (let* ((org-len (length clauses)) + (org-graph (make-vector org-len nil)) + (tag-to-org-alist nil) + forest org-to-forest forest-to-org + i j tmp trans) + (setq tmp clauses + i 0) + (while tmp + (setq tag-to-org-alist (cons (cons (caar tmp) i) tag-to-org-alist) + i (1+ i) + tmp (cdr tmp))) + (setq tmp clauses + i 0) + (while tmp + (setq trans (cddr (cdar tmp))) + (while trans + (setq j (cdr (assoc (cadr (car trans)) tag-to-org-alist))) + (if (not (member j (aref org-graph i))) + (aset org-graph i (cons j (aref org-graph i)))) + (setq trans (cdr trans))) + (setq i (1+ i) + tmp (cdr tmp))) + ;;(error "%s" org-graph) + (setq tmp (digraph-split-as-forest org-graph) + forest (aref tmp 0) + org-to-forest (aref tmp 1) + forest-to-org (aref tmp 2)) + (setq clauses + (mapcar + (lambda (c) + (let ((tag (car c)) + (action (cadr c)) + (fail (nth 2 c)) + (trs (nthcdr 3 c))) + (setq trs + (mapcar + (lambda (next) + (list + (apply + 'natset-union + (mapcar + (lambda (tr) (if (equal (cadr tr) next) (car tr) (natset-empty))) + trs)) + next)) + (natset-enum (apply 'natset-single (mapcar 'cadr trs))))) + (cons tag (cons action (cons fail trs))))) + clauses)) + `(let ((,automata-state-var ,(cdr (assoc start-tag tag-to-org-alist)))) + ,@(automata-exp-seq + (automata-gen-state + in-var clauses + org-len + (list (cdr (assoc start-tag tag-to-org-alist))) + tag-to-org-alist + (aref org-to-forest (cdr (assoc start-tag tag-to-org-alist))) + forest org-to-forest forest-to-org))))) + +(defun automata-gen-state (in-var clauses + org-len + start-states + tag-to-org-alist + forest-state forest org-to-forest forest-to-org) + (let* ((org-states (aref forest-to-org forest-state)) + (forest-states (digraph-descents forest forest-state)) + (branch-length (+ (length org-states) (length forest-states))) + (branch-to-org + (vconcat + (mapcar 'list org-states) + (mapcar (lambda (forest-state) + (aref forest-to-org forest-state)) + forest-states))) + (org-to-branch + (let ((org-to-branch (make-vector org-len nil)) + (i 0) tmp) + (while (< i branch-length) + (setq tmp (aref branch-to-org i)) + (while tmp + (aset org-to-branch (car tmp) i) + (setq tmp (cdr tmp))) + (setq i (1+ i))) + org-to-branch)) + (branch-to-forest + (vconcat + (make-list (length org-states) nil) + forest-states)) + (branch-state-range + (vconcat + (mapcar 'natset-single org-states) + (mapcar (lambda (forest-state) + (apply 'natset-single + (aref forest-to-org forest-state))) + forest-states))) + (branch-descents + (vconcat + (mapcar (lambda (org-state) + (let* ((c (nth org-state clauses)) + (trs (nthcdr 3 c))) + (apply 'natset-union + (mapcar (lambda (tr) + (natset-single + (cdr (assoc (cadr tr) tag-to-org-alist)))) + trs)))) + org-states) + (mapcar (lambda (forest-state) ()) + forest-states))) + (all-descents (apply 'natset-union (append branch-descents ()))) + (branch-ascents + (let* ((branch-ascents (make-vector branch-length 0)) + (i 0) j) + (while (< i branch-length) + (setq j 0) + (while (< j branch-length) + (if (natset-has-intersection-p (aref branch-state-range i) + (aref branch-descents j)) + (aset branch-ascents i + (1+ (aref branch-ascents i)))) + (setq j (1+ j))) + (setq i (1+ i))) + branch-ascents)) + (start-inline nil) + (branch-inline + (let* ((branch-inline (make-vector branch-length nil)) + (start-ns (apply 'natset-single start-states)) + (i 0)) + (while (< i branch-length) + (if (natset-has-intersection-p (aref branch-state-range i) start-ns) + (if (and (= (length start-states) 1) + (= (aref branch-ascents i) 0)) + (progn + (setq start-inline i) + (aset branch-inline i t)) + nil) + (if (= (aref branch-ascents i) 1) + (aset branch-inline i t))) + (setq i (1+ i))) + branch-inline)) + (branch-gen nil) + (_ + (setq branch-gen + (vconcat + (mapcar + (lambda (org-state) + (cons + (lambda (org-state) + `(,(natset-single org-state) + nil ; don't duplicate. + ,@(let* ((c (nth org-state clauses))) + (automata-exp-seq + (nth 1 c) ; action + (if (null (nthcdr 3 c)) + (nth 2 c) + `(automata-branch + ,in-var ; input variable + ,(natset-full) ; input is unpredictable. + ,(nth 2 c) ; fail action + ,@(let ((trs (nthcdr 3 c))) + (mapcar + (lambda (next-branch) + (let ((input-range + (apply 'natset-union + (mapcar + (lambda (tr) + (if (member (cdr (assoc (cadr tr) tag-to-org-alist)) + (aref branch-to-org next-branch)) + (car tr) + (natset-empty))) + trs)))) + `(,input-range ; input range + ,(not (aref branch-inline next-branch)) ; duplicatable unless inlining. + ,(let ((goto-list (apply + 'append + (mapcar + (lambda (tr) + (let ((range (natset-intersection (car tr) input-range))) + (if range + `((,range + t + (automata-goto + ,automata-state-var + ,org-state + ,(cdr (assoc (cadr tr) tag-to-org-alist))))) + ()))) + trs)))) + (if (= (length goto-list) 1) + (car (cddr (car goto-list))) + `(automata-branch + ,in-var + ,input-range + ,automata-never-fail + ,@goto-list))) + ,@(if (aref branch-inline next-branch) + (automata-exp-seq + `(progn ,@(cddr (funcall (car (aref branch-gen next-branch)) + (cdr (aref branch-gen next-branch)))))) + ())))) + (natset-enum + (apply 'natset-union + (mapcar + (lambda (tr) + (natset-single + (aref org-to-branch + (cdr (assoc (cadr tr) tag-to-org-alist))))) + trs)))) + ))))))) + org-state)) + org-states) + (mapcar + (lambda (forest-state) + (cons + (lambda (forest-state) + `(,(natset-intersection + (apply 'natset-single (aref forest-to-org forest-state)) + all-descents) ; state range + nil ; don't duplicate. + ,@(automata-exp-seq + (automata-gen-state + in-var clauses + org-len + (aref forest-to-org forest-state) + tag-to-org-alist + forest-state forest org-to-forest forest-to-org)))) + forest-state)) + forest-states)))) + (branches + (let* ((branches ()) + (i branch-length)) + (while (< 0 i) + (setq i (1- i)) + (if (not (aref branch-inline i)) + (setq branches + (cons + (funcall (car (aref branch-gen i)) + (cdr (aref branch-gen i))) + branches)))) + branches)) + ) + ;;(error "err") + (if start-inline + (apply + 'automata-seq-exp + `(progn ,@(cddr (funcall (car (aref branch-gen start-inline)) + (cdr (aref branch-gen start-inline))))) + (cond + ((null branches) ()) + ((null (cdr branches)) + (cddr (car branches))) + (t + `((while t + (automata-branch + ,automata-state-var ,(natset-full) ,automata-never-fail + ,@branches)))))) + (if (= (length branches) 1) + `(while t ,@(cddr (car branches))) + `(while t ; ,branch-inline ,branch-state-range ,branch-descents ,branch-ascents + (automata-branch + ,automata-state-var ,(natset-full) ,automata-never-fail + ,@branches)))))) + +(defun automata-seq-exp (&rest seq) + (cond + ((null seq) nil) + ((null (cdr seq)) + (car seq)) + (t + (setq seq + (apply + 'append + (mapcar + (lambda (exp) (if (and (consp exp) (eq (car exp) 'progn)) + (cdr exp) + (list exp))) + seq))) + (let ((rseq (reverse seq))) + (cons 'progn + (apply + 'append + (apply + 'append + (mapcar + (lambda (exp) (if (null exp) () (list exp))) + (nreverse (cdr rseq)))) + (list (list (car rseq))))))))) + +(defun automata-exp-seq (&rest seq) + (let ((exp (apply 'automata-seq-exp seq))) + (if (and (consp exp) (eq (car exp) 'progn)) + (cdr exp) + (list exp)))) + +(defmacro automata-goto (var curr next) + (if (eq curr next) + nil + `(setq ,var ,next))) + +(defmacro automata-branch (var range fail &rest clauses) + (when (eq fail automata-never-fail) + (setq range (natset-intersection + (apply 'natset-union (mapcar 'car clauses)) + range))) + (let ((len (length clauses)) + ns-list dup-list body-list tmp ns) + (setq tmp clauses + ns (natset-negate range)) + (while tmp + (setq ns-list (cons (natset-sub (caar tmp) ns) ns-list) + dup-list (cons (cadr (car tmp)) dup-list) + body-list (cons (cddr (car tmp)) body-list) + ns (natset-union ns (caar tmp)) + tmp (cdr tmp)) + (if (natset-empty-p (car ns-list)) + (setq ns-list (cdr ns-list) + dup-list (cdr dup-list) + body-list (cdr body-list)))) + (setq ns-list (nreverse ns-list) + dup-list (nreverse dup-list) + body-list (nreverse body-list)) + (automata-branch-i var range fail ns-list dup-list body-list))) + +(defun automata-branch-i (var range fail ns-list dup-list body-list) + (cond + ((null ns-list) fail) + ((null (cdr ns-list)) + (if (natset-include-p (car ns-list) range) + (apply 'automata-seq-exp (car body-list)) + `(if ,(natset-gen-pred-exp (car ns-list) var range) + ,(apply 'automata-seq-exp (car body-list)) + ,fail))) + (t + (let (tmp tmpn tmpd cut) + (setq tmpn ns-list + cut (natset-empty)) + (while tmpn + (setq cut (natset-union cut (natset-boundary-set (car tmpn))) + tmpn (cdr tmpn))) + (setq tmpn ns-list + tmpd dup-list) + (while tmpn + (if (not (car tmpd)) + (setq tmp (natset-minmax (car tmpn)) + tmp (natset-sub tmp (natset-start-set tmp)) + cut (natset-sub cut tmp))) + (setq tmpn (cdr tmpn) + tmpd (cdr tmpd))) + (setq cut (natset-sub cut (natset-boundary-set (natset-minmax range)))) + (if (null (setq cut (natset-enum cut))) + `(if ,(natset-gen-pred-exp (car ns-list) var range) + ,(apply 'automata-seq-exp (car body-list)) + ,(automata-branch-i var + (natset-sub range (car ns-list)) + fail + (cdr ns-list) + (cdr dup-list) + (cdr body-list))) + (let* ((mid (nth (/ (length cut) 2) cut)) + (lower (natset-seg 0 (1- mid))) + (higher (natset-seg mid)) + ns-list1 dup-list1 body-list1 + ns-list2 dup-list2 body-list2 + ) + (while ns-list + (if (natset-has-intersection-p lower (car ns-list)) + (setq ns-list1 (cons (natset-intersection (car ns-list) lower) ns-list1) + dup-list1 (cons (car dup-list) dup-list1) + body-list1 (cons (car body-list) body-list1))) + (if (natset-has-intersection-p higher (car ns-list)) + (setq ns-list2 (cons (natset-intersection (car ns-list) higher) ns-list2) + dup-list2 (cons (car dup-list) dup-list2) + body-list2 (cons (car body-list) body-list2))) + (setq ns-list (cdr ns-list) + dup-list (cdr dup-list) + body-list (cdr body-list))) + ;;(if (or (null ns-list1) (null ns-list2)) (error "divide fail")) + `(if (< ,var ,mid) + ,(automata-branch-i var + (natset-intersection range lower) + fail ns-list1 dup-list1 body-list1) + ,(automata-branch-i var + (natset-intersection range higher) + fail ns-list2 dup-list2 body-list2)))))))) + + +'( +(npp (macroexpand ' +(automata pc 0 + (1 + (progn + (lex-match 1) + (lex-scan-unibyte-save)) + (lex-fail)) + (5 + (progn + (lex-match 2) + (lex-scan-unibyte-save)) + (lex-fail)) + (4 + (progn + (lex-scan-unibyte-read pc)) + (lex-fail) + ((9 10) 5) ((32 33) 5)) + (2 + (progn + (lex-match 3) + (lex-scan-unibyte-save) + (lex-scan-unibyte-read pc)) + (lex-fail) + ((13 14) 2) + ((0 9) 3) ((11 13) 3) ((14 32) 3) ((33) 3) + ((10 11) 4)) + (3 + (progn + (lex-match 3) + (lex-scan-unibyte-save) + (lex-scan-unibyte-read pc)) + (lex-fail) + ((0 9) 3) ((10 11) 3) ((11 13) 3) ((14 32) 3) ((33) 3) + ((13 14) 2) + ) + (0 + (progn + (lex-match 3) + (lex-scan-unibyte-save) + (lex-scan-unibyte-read pc)) + (lex-fail) + ((0 9) 3) ((10 11) 3) ((11 13) 3) ((14 32) 3) ((33) 3) + ((13 14) 2) + ((9 10) 1) ((32 33) 1))))) + +(npp (macroexpand ' +(automata pc 0 + (0 + (progn + (lex-match 3) + (lex-scan-unibyte-save) + (lex-scan-unibyte-read pc)) + (lex-fail) + ((0 9) 3) ((10 11) 3) ((11 13) 3) ((14 32) 3) ((33) 3) + ((13 14) 2) + ((9 10) 1) ((32 33) 1)) + (1 + (progn + (lex-match 1) + (lex-scan-unibyte-save)) + (lex-fail)) + (2 + (progn + (lex-match 3) + (lex-scan-unibyte-save) + (lex-scan-unibyte-read pc)) + (lex-fail) + ((13 14) 2) + ((0 9) 3) ((11 13) 3) ((14 32) 3) ((33) 3) + ((10 11) 4)) + (3 + (progn + (lex-match 3) + (lex-scan-unibyte-save) + (lex-scan-unibyte-read pc)) + (lex-fail) + ((0 9) 3) ((10 11) 3) ((11 13) 3) ((14 32) 3) ((33) 3) + ((13 14) 2) + ) + (4 + (progn + (lex-scan-unibyte-read pc)) + (lex-fail) + ((9 10) 5) ((32 33) 5)) + (5 + (progn + (lex-match 2) + (lex-scan-unibyte-save)) + (lex-fail)) +))) + +(npp +(automata-gen-state +'pc +'((0 (progn (lex-match 3) (lex-scan-unibyte-save) (lex-scan-unibyte-read pc)) (lex-fail) ((9 10 32 33) 1) ((13 14) 2) ((0 9 10 13 14 32 33) 3)) + (1 (progn (lex-match 1) (lex-scan-unibyte-save)) (lex-fail)) + (2 (progn (lex-match 3) (lex-scan-unibyte-save) (lex-scan-unibyte-read pc)) (lex-fail) ((13 14) 2) ((0 9 11 13 14 32 33) 3) ((10 11) 4)) + (3 (progn (lex-match 3) (lex-scan-unibyte-save) (lex-scan-unibyte-read pc)) (lex-fail) ((13 14) 2) ((0 9 10 13 14 32 33) 3)) + (4 (progn (lex-scan-unibyte-read pc)) (lex-fail) ((9 10 32 33) 5)) + (5 (progn (lex-match 2) (lex-scan-unibyte-save)) (lex-fail))) +6 +'(0) +'((5 . 5) (4 . 4) (3 . 3) (2 . 2) (1 . 1) (0 . 0)) +0 +[(4 1) (2) (3) nil nil] +[0 4 1 1 2 3] +[(0) (3 2) (4) (5) (1)])) +) \ No newline at end of file diff --git a/closure.el b/closure.el new file mode 100644 index 0000000..fabab4f --- /dev/null +++ b/closure.el @@ -0,0 +1,44 @@ +(provide 'closure) + +(defmacro closure-make (fun &rest fvs) + "Make closure from function FUN and free variable list FVS. +CAUTION: Do not assign to free variables." + (let* ((funv (make-symbol "funv")) + (args (make-symbol "args"))) + `(list + ,fun + (lambda (,funv ,args ,@fvs) + (apply ,funv ,args)) + ,@fvs))) + +(defun closure-call (clo &rest args) + "Call closure." + (if (functionp clo) + (apply clo args) + (apply (cadr clo) (car clo) args (cddr clo)))) + +(defun closure-compose (c1 c2) + "Compose C1 and C2. + +If either C1 or C2 is nil, another one is returned. +If C1 and C2 is non-nil, C1 must be closure with one argument." + (cond + ((null c1) c2) + ((null c2) c1) + (t + (closure-make + (lambda (&rest args) + (closure-call c1 (apply 'closure-call c2 args))) + c1 c2)))) + +'( + +(setq c1 (let ((a 1)) (closure-make (lambda (b) (+ a b)) a))) +(closure-call c1 2) ; => 3 + +(let ((a 1)) (setq plus1 (closure-make (lambda (b) (+ a b)) a))) +(let ((a 2)) (setq plus2 (closure-make (lambda (b) (+ a b)) a))) +(setq plus3 (closure-compose plus1 plus2)) +(closure-call plus3 4) ; => 7 + +) \ No newline at end of file diff --git a/digraph.el b/digraph.el new file mode 100644 index 0000000..2a463d1 --- /dev/null +++ b/digraph.el @@ -0,0 +1,347 @@ +;;; directed graph package. +(provide 'digraph) + +;; A directed graph is represented as vector of lists of integers. +;; The number of nodes in the graph is length of the vector. +;; +;; i'th list of the vector contains j <=> there is the edge from i to j. + +(defalias 'digraph-descents 'aref) + +(defun digraph-split-as-dag (g) + "Returns 3 element vector of follows. + +0. Directed acyclic graph generated by mergeing each strongly connected +components in G as new nodes. + +1. Map from a node in g to a node in result. + +2. Map from a node in result to nodes in g." + (let* ((old-len (length g)) + (new-to-olds (vconcat (digraph-tsort g))) + (new-len (length new-to-olds)) + (old-to-new (make-vector old-len ())) + (i 0) + js res) + (while (< i new-len) + (setq js (aref new-to-olds i)) + (while js + (aset old-to-new (car js) i) + (setq js (cdr js))) + (setq i (1+ i))) + (setq i (1- new-len)) + (while (<= 0 i) + (setq res (cons + (digraph-list-uniq + (apply + 'nconc + (mapcar + (lambda (old) + (apply + 'nconc + (mapcar + (lambda (old) + (if (= i (aref old-to-new old)) + () + (list (aref old-to-new old)))) + (aref g old)))) + (aref new-to-olds i)))) + res) + i (1- i))) + (vector + (vconcat res) + old-to-new + new-to-olds))) + +(defun digraph-split-as-forest (g) + "Returns 3 element vector of follows. + +0. Tree generated by merging nodes which have common descent node. + +1. Map from a node in g to a node in result. + +2. Map from a node in result to nodes in g." + (let* ((tmp (digraph-split-as-dag g)) + (d (aref tmp 0)) + (g-to-d (aref tmp 1)) + (d-to-g (aref tmp 2)) + ;;(_ (error "%s" tmp)) + (tmp (digraph-dag-to-forest d)) + (f (aref tmp 0)) + (d-to-f (aref tmp 1)) + (f-to-d (aref tmp 2)) + old-indices + new-indices + i) + (setq i (1- (length g))) + (while (<= 0 i) + (setq old-indices (cons i old-indices) + i (1- i))) + (setq i (1- (length f))) + (while (<= 0 i) + (setq new-indices (cons i new-indices) + i (1- i))) + (vector + f + (vconcat + (mapcar + (lambda (gi) (aref d-to-f (aref g-to-d gi))) + old-indices)) + (vconcat + (mapcar + (lambda (fi) + (apply + 'nconc + (mapcar + (lambda (di) (aref d-to-g di)) + (aref f-to-d fi)))) + new-indices))))) + +(defun digraph-tsort (dep) + "Sort nodes in a graph toporogicaly. + +DEP is a vector of lists of integers and +digraph-tsort returns list of lists of integers. + +The graph has (length DEP) nodes. + +Dependency for i'th node is represented by i'th element of DEP. +If (aref DEP i) is a list (j1 j2 ... jn), it represents that i'th node +depends to j1, j2, ... and jn'th nodes of the graph. + +A result of digraph-tsort is a sequence of sets of indexes for each +strongly connected nodes ordered by indepenedent to dependent as list +of lists of integers. + +If (nth n result) contains an integer i, it represents the fact as +follows. + +1. For all j in (nth n result) and j != i, i'th node depends to j'th +node and vice versa. + +2. For all m < n and j in (nth m result), i'th nodes does not depend +to j'th node." + (let* ((len (length dep)) + (ord (make-vector len nil)) + (i 0) + (res ())) + (while (< i len) + (if (not (aref ord i)) + (setq res (nth 3 (digraph-tsort-visit dep len ord i 0 () res)))) + (setq i (1+ i))) + res)) + +(defun digraph-tsort-visit (dep len ord i id stk res) + (aset ord i id) + (let ((js (aref dep i)) + (m id) + (nid (1+ id)) + (stk (cons i stk)) + (res res)) + (while js + (let* ((j (car js)) (jo (aref ord j))) + (if jo + (setq m (if (< m jo) m jo)) + (let* ((tmp (digraph-tsort-visit dep len ord j nid stk res)) + (m0 (nth 0 tmp))) + (setq m (if (< m m0) m m0) + nid (nth 1 tmp) + stk (nth 2 tmp) + res (nth 3 tmp))))) + (setq js (cdr js))) + (if (= m id) + (let* ((p (member i stk)) + (nstk (cdr p)) + (tmp stk)) + (setcdr p ()) + (while tmp + (aset ord (car tmp) len) + (setq tmp (cdr tmp))) + (list m nid nstk (cons stk res))) + (list m nid stk res)))) + +(defun digraph-reverse (g) + "Return graph with reversed edge." + (let* ((len (length g)) + (rev (make-vector len nil)) + (i 0)) + (while (< i len) + (let ((links (aref g i))) + (while links + (if (not (member i (aref rev (car links)))) + (aset rev (car links) (cons i (aref rev (car links))))) + (setq links (cdr links)))) + (setq i (1+ i))) + rev)) + +(defun digraph-leaves (g) + "Return list of leaves of G." + (let* ((i (length g)) + (res ())) + (while (< 0 i) + (setq i (1- i)) + (if (null (aref g i)) + (setq res (cons i res)))) + res)) + +(defun digraph-roots (g) + "Return list of roots of G." + (digraph-leaves (digraph-reverse g))) + +;;; forest + +(defun digraph-dag-to-forest (dag) + "Convert a DAG(directed acyclic graph) to forest(set of trees)." + (let* ((len (length dag)) + (rev (digraph-reverse dag)) + (forest (digraph-forest-make len)) + (i 0)) + (while (< i len) + (digraph-dag-forest-add-node forest i (aref rev i)) + (setq i (1+ i))) + ;;(error "%s" forest) + (digraph-forest-to-graph forest))) + +(defun digraph-dag-forest-add-node (forest node links) + (if (null links) + (digraph-forest-add-tree forest node) + (let ((parent (car links))) + (setq links (cdr links)) + (digraph-forest-add-node forest parent node) + (while links + (digraph-forest-merge-node forest parent (car links)) + (setq links (cdr links))))) + forest) + +;; forest = [last-tree-id node-merge-map node-parent-map] +(defun digraph-forest-make (num) + (vector 0 (make-vector num nil) (make-vector num nil))) +(defsubst digraph-forest-merge-map (forest) (aref forest 1)) +(defsubst digraph-forest-parent-map (forest) (aref forest 2)) + +(defun digraph-forest-add-tree (forest node) + (aset (digraph-forest-parent-map forest) + node + (aset forest 0 (1- (aref forest 0))))) + +(defun digraph-forest-add-node (forest parent node) + (aset (digraph-forest-parent-map forest) node parent)) + +(defun digraph-forest-node-id (forest node) + (let ((merge-map (digraph-forest-merge-map forest)) tmp) + (while (setq tmp (aref merge-map node)) + (setq node tmp)) + node)) + +(defun digraph-forest-tree-id (forest node) + (setq node (digraph-forest-node-id forest node)) + (let ((parent-map (digraph-forest-parent-map forest)) + tmp) + (while (<= 0 (setq tmp (aref parent-map node))) + (setq node (digraph-forest-node-id forest tmp))) + tmp)) + +(defun digraph-forest-root-p (forest node) + (setq node (digraph-forest-node-id forest node)) + (< (aref (digraph-forest-parent-map forest) node) 0)) + +(defun digraph-forest-path-to-root (forest node) + (setq node (digraph-forest-node-id forest node)) + (let ((parent-map (digraph-forest-parent-map forest)) + (path (list node))) + (while (not (digraph-forest-root-p forest node)) + (setq node (digraph-forest-node-id forest (aref parent-map node)) + path (cons node path))) + path)) + +(defun digraph-forest-merge-node (forest n1 n2) + (setq n1 (digraph-forest-node-id forest n1) + n2 (digraph-forest-node-id forest n2)) + (let ((t1 (digraph-forest-tree-id forest n1)) + (t2 (digraph-forest-tree-id forest n2))) + (if (= t1 t2) + (let ((merge-map (digraph-forest-merge-map forest)) + (p1 (digraph-forest-path-to-root forest n1)) + (p2 (digraph-forest-path-to-root forest n2)) + top) + (while (and p1 p2 + (= (car p1) (car p2))) + (setq top (car p1) + p1 (cdr p1) + p2 (cdr p2))) + (setq p1 (nreverse p1)) + (setq p2 (nreverse p2)) + (while (and p1 p2) + (aset merge-map (car p2) (car p1)) + (setq p1 (cdr p1) + p2 (cdr p2))) + (if (or p1 p2) + (let ((ns (nconc p1 p2)) n) + (while ns + (aset merge-map (car ns) top) + (setq ns (cdr ns)))))) + (let ((merge-map (digraph-forest-merge-map forest)) + (parent-map (digraph-forest-parent-map forest))) + (while (and (not (digraph-forest-root-p forest n1)) + (not (digraph-forest-root-p forest n2))) + (aset merge-map n2 n1) + (setq n1 (digraph-forest-node-id forest (aref parent-map n1)) + n2 (digraph-forest-node-id forest (aref parent-map n2)))) + (if (digraph-forest-root-p forest n2) + (aset merge-map n2 n1) + (aset merge-map n1 n2))))) + forest) + +(defun digraph-forest-to-graph (forest) + (let* ((merge-map (digraph-forest-merge-map forest)) + (parent-map (digraph-forest-parent-map forest)) + (old-len (length merge-map)) + (old-to-new (make-vector old-len nil)) + new-to-olds + (new-len 0) + (i 0) + j + graph + ) + (while (< i old-len) + (setq j (digraph-forest-node-id forest i)) + (if (aref old-to-new j) + (aset old-to-new i (aref old-to-new j)) + (progn + (aset old-to-new j new-len) + (aset old-to-new i new-len) + (setq new-len (1+ new-len)))) + (setq i (1+ i))) + (setq new-to-olds (make-vector new-len nil) + graph (make-vector new-len nil)) + (setq i (1- old-len)) + (while (<= 0 i) + (setq j (aref old-to-new i)) + (aset new-to-olds j + (cons i (aref new-to-olds j))) + (setq i (1- i))) + (setq i 0) + (while (< i new-len) + (setq j (aref parent-map (digraph-forest-node-id forest (car (aref new-to-olds i))))) + (if (<= 0 j) (aset graph (aref old-to-new j) + (cons i (aref graph (aref old-to-new j))))) + (setq i (1+ i))) + (vector graph old-to-new new-to-olds))) + +;;; utilities + +(defun digraph-list-uniq (il) + (if (null il) + () + (if (member (car il) (cdr il)) + (digraph-list-uniq (cdr il)) + (cons (car il) (digraph-list-uniq (cdr il)))))) + +'( +(digraph-split-as-forest [nil nil nil nil nil nil nil nil nil nil (10) nil nil (12) (13) (0 1 2 3 4 5 6 7 8 9 10 11 13 14)]) +[ + [(14 13 12 11 10 9 8 7 6 5 4 3 1) (2) nil nil nil nil nil nil nil nil nil nil nil nil nil] + [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 0] + [(15) (14) (13) (12) (11) (10) (9) (8) (7) (6) (5) (4) (3) (2) (1) (0)]] + +) \ No newline at end of file diff --git a/ew-data.el b/ew-data.el new file mode 100644 index 0000000..e45f9d7 --- /dev/null +++ b/ew-data.el @@ -0,0 +1,138 @@ +(provide 'ew-data) + +(defun ew-make-anchor (column str) + (let ((anchor (make-symbol str))) + (put anchor 'anchor anchor) + (put anchor 'prev-frag anchor) + (put anchor 'next-frag anchor) + (put anchor 'prev-token anchor) + (put anchor 'next-token anchor) + (put anchor 'column column) + (put anchor 'line-length 0) + anchor)) + +(defun ew-terminate (anchor) + (let ((frag (get anchor 'prev-frag)) + (line-length (get anchor 'column))) + (while (null (get frag 'line-length)) + (put frag 'line-length line-length) + (setq frag (get frag 'prev-frag))))) + +(defun ew-tokenize-frag (anchor frag) + (put frag 'prev-token (get anchor 'prev-token)) + (put frag 'next-token anchor) + (put (get anchor 'prev-token) 'next-token frag) + (put anchor 'prev-token frag) + frag) + +(defun ew-add-frag (anchor start end type) + (let ((frag (make-symbol (substring (symbol-name anchor) start end)))) + (put frag 'anchor anchor) + (put frag 'start start) + (put frag 'end end) + (put frag 'type type) + (put frag 'prev-frag (get anchor 'prev-frag)) + (put frag 'next-frag anchor) + (put (get anchor 'prev-frag) 'next-frag frag) + (put anchor 'prev-frag frag) + (put frag 'decode (or (get type 'decode) 'ew-decode-none)) + (if (string-match "\r\n\\(.*\r\n\\)*" (symbol-name frag)) + (let ((prev-line-length (+ (get anchor 'column) (match-beginning 0))) + (next-line-column (- (length (symbol-name frag)) (match-end 0))) + (tmp frag)) + (while (null (get tmp 'line-length)) + (put tmp 'line-length prev-line-length) + (setq tmp (get tmp 'prev-frag))) + (put anchor 'column next-line-column)) + (put anchor 'column (+ (get anchor 'column) (length (symbol-name frag))))) + frag)) + +(defun ew-add-open (anchor start end type) + (let ((frag (ew-add-frag anchor start end type))) + (put frag 'prev-open (get anchor 'prev-open)) + (put anchor 'prev-open frag) + frag)) + +(defun ew-add-close (anchor start end type) + (let ((frag (ew-add-frag anchor start end type))) + (put frag 'pair (get anchor 'prev-open)) + (put (get anchor 'prev-open) 'pair frag) + (put anchor 'prev-open (get (get frag 'pair) 'prev-open)) + frag)) + +(defun ew-add-token (anchor start end type) + (ew-tokenize-frag anchor (ew-add-frag anchor start end type))) + +(defun ew-add-close-token (anchor start end type) + (ew-tokenize-frag anchor (ew-add-close anchor start end type))) + +;;; listup + +(defun ew-frag-list (anchor) + (let ((res ()) + (tmp (get anchor 'prev-frag))) + (while (not (eq anchor tmp)) + (setq res (cons tmp res) + tmp (get tmp 'prev-frag))) + res)) + +(defun ew-token-list (anchor) + (let ((res ()) + (tmp (get anchor 'prev-token))) + (while (not (eq anchor tmp)) + (setq res (cons tmp res) + tmp (get tmp 'prev-token))) + res)) + +(defun ew-pair-list (anchor) + (mapcar + (lambda (frag) + (cons (symbol-value (get frag 'type)) + frag)) + (ew-frag-list anchor))) + +;;; phrase marking + +(defun ew-mark-phrase (frag1 frag2) + (while (not (eq frag1 frag2)) + (unless (ew-comment-frag-p frag2) + (put frag2 'decode 'ew-decode-phrase)) + (setq frag2 (get frag2 'prev-frag))) + (unless (ew-comment-frag-p frag2) + (put frag2 'decode 'ew-decode-phrase)) + (setq frag2 (get frag2 'prev-frag)) + (while (not (get frag2 'prev-token)) + (unless (ew-comment-frag-p frag2) + (put frag2 'decode 'ew-decode-phrase)) + (setq frag2 (get frag2 'prev-frag)))) + +;;; frag predicate + +(defun ew-comment-frag-p (frag) + (member (get frag 'type) + '(ew:raw-cm-begin-tok + ew:raw-cm-end-tok + ew:raw-cm-nested-begin-tok + ew:raw-cm-nested-end-tok + ew:raw-cm-texts-tok + ew:raw-cm-wsp-tok + ew:raw-cm-fold-tok + ew:raw-cm-qfold-tok + ew:raw-cm-qpair-tok))) + +(defun ew-special-frag-p (frag) + (or (eq frag (get frag 'anchor)) + (member (get frag 'type) + '(ew:raw-lt-tok + ew:raw-gt-tok + ew:raw-at-tok + ew:raw-comma-tok + ew:raw-semicolon-tok + ew:raw-colon-tok + ew:raw-dot-tok + ew:raw-qs-begin-tok + ew:raw-qs-end-tok + ew:raw-dl-begin-tok + ew:raw-dl-end-tok + ew:raw-cm-begin-tok + ew:raw-cm-end-tok)))) diff --git a/ew-dec.el b/ew-dec.el new file mode 100644 index 0000000..e805aee --- /dev/null +++ b/ew-dec.el @@ -0,0 +1,414 @@ +(require 'emu) +(require 'ew-unit) +(require 'ew-scan-s) +(require 'ew-scan-m) +(require 'ew-scan-u) +(require 'ew-parse) +(provide 'ew-dec) + +;;; user customizable variable. + +(defvar ew-decode-quoted-encoded-word nil) +(defvar ew-ignore-75bytes-limit nil) +(defvar ew-ignore-76bytes-limit nil) +(defvar ew-permit-sticked-comment nil) +(defvar ew-permit-sticked-special nil) + +;; anonymous function to decode ground string. +;; NOTE: STR is CRLF-form and it should return as CRLF-form. +(defvar ew-decode-us-ascii (lambda (str) (decode-coding-string str 'iso-latin-1-unix))) + +;;; +(defvar ew-decode-field-syntax-alist +'(("from" ew-scan-unibyte-std11 . ew:tag-mailbox+-tok) + ("sender" ew-scan-unibyte-std11 . ew:tag-mailbox-tok) + ("to" ew-scan-unibyte-std11 . ew:tag-address+-tok) + ("resent-to" ew-scan-unibyte-std11 . ew:tag-address+-tok) + ("cc" ew-scan-unibyte-std11 . ew:tag-address+-tok) + ("resent-cc" ew-scan-unibyte-std11 . ew:tag-address+-tok) + ("bcc" ew-scan-unibyte-std11 . ew:tag-address*-tok) + ("resent-bcc" ew-scan-unibyte-std11 . ew:tag-address*-tok) + ("message-id" ew-scan-unibyte-std11) + ("resent-message-id" ew-scan-unibyte-std11) + ("in-reply-to" ew-scan-unibyte-std11 . ew:tag-phrase-msg-id*-tok) + ("references" ew-scan-unibyte-std11 . ew:tag-phrase-msg-id*-tok) + ("keywords" ew-scan-unibyte-std11 . ew:tag-phrase*-tok) + ("subject" ew-scan-unibyte-unstructured) + ("comments" ew-scan-unibyte-unstructured) + ("encrypted" ew-scan-unibyte-std11) + ("date" ew-scan-unibyte-std11) + ("reply-to" ew-scan-unibyte-std11 . ew:tag-address+-tok) + ("received" ew-scan-unibyte-std11) + ("resent-reply-to" ew-scan-unibyte-std11 . ew:tag-address+-tok) + ("resent-from" ew-scan-unibyte-std11 . ew:tag-mailbox+-tok) + ("resent-sender" ew-scan-unibyte-std11 . ew:tag-mailbox-tok) + ("resent-date" ew-scan-unibyte-std11) + ("return-path" ew-scan-unibyte-std11) + ("mime-version" ew-scan-unibyte-std11) + ("content-type" ew-scan-unibyte-mime) + ("content-transfer-encoding" ew-scan-unibyte-mime) + ("content-id" ew-scan-unibyte-mime) + ("content-description" ew-scan-unibyte-unstructured) +)) +(defvar ew-decode-field-default-syntax '(ew-scan-unibyte-unstructured)) + +(defun ew-decode-field (field-name field-body &optional eword-filter) + "Decode MIME RFC2047 encoded-words in a field. +FIELD-NAME is a name of the field such as \"To\", \"Subject\" etc. and +used to selecting syntax of body of the field and deciding first +column of body of the field. +FIELD-BODY is a body of the field. + +If FIELD-BODY has multiple lines, each line is separated by CRLF as +pure network representation. Also if the result has multiple lines, +each line is separated by CRLF. + +If EWORD-FILTER is non-nil, it should be closure. it is called for +each successful decoded encoded-word with decoded string as a +argument. The return value of EWORD-FILTER is used as decoding result +instead of its argument." + (let ((tmp (assoc (downcase field-name) ew-decode-field-syntax-alist)) + frag-anchor frag1 frag2 decode) + (if tmp + (setq tmp (cdr tmp)) + (setq tmp ew-decode-field-default-syntax)) + (setq frag-anchor (funcall (car tmp) (1+ (length field-name)) field-body)) + ;;(setq zzz frag-anchor) + (when (cdr tmp) + (ew-mark (cdr tmp) frag-anchor)) + (setq frag1 (get frag-anchor 'next-frag)) + (while (not (eq frag1 frag-anchor)) + (setq decode (get frag1 'decode)) + (setq frag2 (get frag1 'next-frag)) + (while (and (not (eq frag2 frag-anchor)) + (eq decode (get frag2 'decode))) + (setq frag2 (get frag2 'next-frag))) + (funcall decode frag-anchor frag1 frag2 eword-filter) + (setq frag1 frag2)) + (mapconcat (lambda (frag) (or (get frag 'result) (symbol-name frag))) + (ew-frag-list frag-anchor) ""))) + +(defun ew-mark (tag anchor) + (let ((tlist (cons (list (symbol-value tag)) (ew-pair-list anchor)))) + ;;(insert (format "%s" tlist)) + (ew-parse + (lambda () (if (null tlist) '(0) + (prog1 (car tlist) (setq tlist (cdr tlist))))) + (lambda (msg tok) (setq zzz-anchor anchor) (message "parse error: %s%s : %s" msg tok anchor))))) + +(defun ew-decode-none (anchor frag end eword-filter) + (while (not (eq frag end)) + (put frag 'result (funcall ew-decode-us-ascii (symbol-name frag))) + (setq frag (get frag 'next-frag)))) + +(defun ew-decode-generic (anchor start end + decode-ewords + decode-others + eword gap all + eword-filter) + (let ((frag start) result buff type f) + (while (not (eq frag end)) + (setq type (get frag 'type)) + (cond + ((and (memq type eword) + (ew-proper-eword-p frag)) + (when buff + (setq result (ew-rappend result + (funcall decode-others + (nreverse buff))) + buff ())) + (let ((first frag) (ewords (list frag))) + (while (progn + (setq f (get frag 'next-frag)) + (while (and (not (eq f end)) + (memq (get f 'type) gap)) + (setq f (get f 'next-frag))) + (and (not (eq f end)) + (ew-proper-eword-p f))) + (setq ewords (ew-rcons* ewords f) + frag f)) + (while (not (eq first frag)) + (put first 'result "") + (setq first (get first 'next-frag))) + (put frag 'result "") + (setq result (ew-rappend result + (funcall decode-ewords + (nreverse ewords) + eword-filter))))) + ((memq type all) + (setq buff (cons frag buff)) + (put frag 'result "")) + (t + (error "unexpected token: %s (%s)" frag type))) + (setq frag (get frag 'next-frag))) + (when buff + (setq result (ew-rappend result (funcall decode-others (nreverse buff))))) + (put start 'result + (apply 'ew-quote-concat (nreverse result))) + )) + +(defun ew-decode-generic-others (frags puncts quotes targets) + (let (result buff frag type tmp) + (while frags + (setq frag (car frags) + type (get frag 'type) + frags (cdr frags)) + (cond + ((memq type puncts) + (when buff + (setq buff (nreverse buff) + tmp (funcall ew-decode-us-ascii + (mapconcat 'car buff ""))) + (if (ew-contain-non-ascii-p tmp) + (setq result (ew-rcons* result tmp)) + (setq result (ew-rcons* + result + (funcall ew-decode-us-ascii + (mapconcat 'cdr buff ""))))) + (setq buff ())) + (setq result (ew-rcons* + result + (symbol-name frag)))) + ((memq type quotes) + (setq buff (ew-rcons* + buff + (cons (substring (symbol-name frag) 1) + (symbol-name frag))))) + ((memq type targets) + (setq buff (ew-rcons* + buff + (cons (symbol-name frag) + (symbol-name frag))))) + (t + (error "something wrong: unexpected token: %s (%s)" frag type)))) + (when buff + (setq buff (nreverse buff) + tmp (funcall ew-decode-us-ascii + (mapconcat 'car buff ""))) + (if (ew-contain-non-ascii-p tmp) + (setq result (ew-rcons* result tmp)) + (setq result (ew-rcons* + result + (funcall ew-decode-us-ascii + (mapconcat 'cdr buff ""))))) + (setq buff ())) + (nreverse result))) + +(defun ew-decode-unstructured-ewords (ewords eword-filter) + (let (result) + (while ewords + (setq result (ew-rcons* + result + (list (ew-decode-eword (symbol-name (car ewords)) + eword-filter + 'ew-encode-crlf))) + ewords (cdr ewords))) + (nreverse result))) + +(defun ew-decode-unstructured-others (frags) + (let (result) + (while frags + (setq result (ew-rcons* + result + (symbol-name (car frags))) + frags (cdr frags))) + (list (funcall ew-decode-us-ascii + (apply 'concat (nreverse result)))))) + +(defun ew-decode-unstructured (anchor start end eword-filter) + (ew-decode-generic + anchor start end + 'ew-decode-unstructured-ewords + 'ew-decode-unstructured-others + '(ew:raw-us-texts-tok) + '(ew:raw-us-wsp-tok + ew:raw-us-fold-tok) + '(ew:raw-us-texts-tok + ew:raw-us-wsp-tok + ew:raw-us-fold-tok) + eword-filter)) + +(defun ew-decode-phrase-ewords (ewords eword-filter) + (let ((qs (eq (get (car ewords) 'type) 'ew:raw-qs-texts-tok)) + require-quoting + result) + (while ewords + (setq result (ew-rcons* + result + (list (ew-decode-eword (symbol-name (car ewords)) + eword-filter + 'ew-encode-crlf))) + require-quoting (or require-quoting + (string-match "[][()<>@,;:\\\".\000-\037]" + (caar result))) + ewords (cdr ewords))) + (if require-quoting + (list + (funcall (if qs 'ew-embed-in-quoted-string 'ew-embed-in-phrase) + (apply 'ew-quote-concat + (nreverse result)))) + (nreverse result)))) + +(defun ew-decode-phrase-others (frags) + (ew-decode-generic-others + frags + '(ew:raw-qs-begin-tok + ew:raw-qs-end-tok) + '(ew:raw-qs-qfold-tok + ew:raw-qs-qpair-tok) + '(ew:raw-atom-tok + ew:raw-wsp-tok + ew:raw-fold-tok + ew:raw-qs-texts-tok + ew:raw-qs-wsp-tok + ew:raw-qs-fold-tok))) + +(defun ew-decode-phrase (anchor start end eword-filter) + (ew-decode-generic + anchor start end + 'ew-decode-phrase-ewords + 'ew-decode-phrase-others + (if ew-decode-quoted-encoded-word + '(ew:raw-atom-tok ew:raw-qs-texts-tok) + '(ew:raw-atom-tok)) + '(ew:raw-wsp-tok + ew:raw-fold-tok) + '(ew:raw-atom-tok + ew:raw-wsp-tok + ew:raw-fold-tok + ew:raw-qs-begin-tok + ew:raw-qs-end-tok + ew:raw-qs-texts-tok + ew:raw-qs-wsp-tok + ew:raw-qs-fold-tok + ew:raw-qs-qfold-tok + ew:raw-qs-qpair-tok) + eword-filter)) + +(defun ew-decode-comment-ewords (ewords eword-filter) + (let (require-quoting + result) + (while ewords + (setq result (ew-rcons* + result + (list (ew-decode-eword (symbol-name (car ewords)) + eword-filter + 'ew-encode-crlf))) + require-quoting (or require-quoting + (string-match "[()\\\\]" (caar result))) + ewords (cdr ewords))) + (if require-quoting + (list + (ew-embed-in-comment + (apply 'ew-quote-concat + (nreverse result)))) + (nreverse result)))) + +(defun ew-decode-comment-others (frags) + (ew-decode-generic-others + frags + '() + '(ew:raw-cm-qfold-tok + ew:raw-cm-qpair-tok) + '(ew:raw-cm-texts-tok + ew:raw-cm-wsp-tok + ew:raw-cm-fold-tok))) + +(defun ew-decode-comment (anchor start end eword-filter) + (ew-decode-generic + anchor start end + 'ew-decode-comment-ewords + 'ew-decode-comment-others + '(ew:raw-cm-texts-tok) + '(ew:raw-cm-wsp-tok + ew:raw-cm-fold-tok) + '(ew:raw-cm-texts-tok + ew:raw-cm-wsp-tok + ew:raw-cm-fold-tok + ew:raw-cm-qfold-tok + ew:raw-cm-qpair-tok) + eword-filter)) + +;;; + +(defun ew-embed-in-phrase (str) + (concat "\"" (ew-embed-in-quoted-string str) "\"")) + +(defun ew-embed-in-quoted-string (str) + (ew-quote-as-quoted-pair str '(?\\ ?\"))) + +(defun ew-embed-in-comment (str) + (ew-quote-as-quoted-pair str '(?\\ ?\( ?\)))) + +(defun ew-quote-as-quoted-pair (str specials) + (let ((i 0) (j 0) (l (length str)) result) + (while (< j l) + (when (member (aref str j) specials) + (setq result (ew-rcons* + result + (substring str i j) + "\\") + i j)) + (setq j (1+ j))) + (when (< i l) + (setq result (ew-rcons* + result + (substring str i)))) + (apply 'concat (nreverse result)))) + +;;; + +(defun ew-proper-eword-p (frag) + (and + (or ew-ignore-75bytes-limit + (<= (length (symbol-name frag)) 75)) + (or ew-ignore-76bytes-limit + (<= (get frag 'line-length) 76)) + (cond + ((eq (get frag 'type) 'ew:raw-cm-texts-tok) + (ew-eword-p (symbol-name frag))) + ((eq (get frag 'type) 'ew:raw-qs-texts-tok) + (ew-eword-p (symbol-name frag))) + ((eq (get frag 'type) 'ew:raw-atom-tok) + (and + (or ew-permit-sticked-comment + (and + (not (ew-comment-frag-p (get frag 'prev-frag))) + (not (ew-comment-frag-p (get frag 'next-frag))))) + (or ew-permit-sticked-special + (and + (or (ew-comment-frag-p (get frag 'prev-frag)) + (not (ew-special-frag-p (get frag 'prev-frag)))) + (or (ew-comment-frag-p (get frag 'next-frag)) + (not (ew-special-frag-p (get frag 'next-frag)))))) + (ew-eword-p (symbol-name frag)))) + ((eq (get frag 'type) 'ew:raw-us-texts-tok) + (and + (or ew-permit-sticked-special + (not (ew-special-frag-p (get frag 'prev-frag)))) + (ew-eword-p (symbol-name frag)))) + (t + nil)))) + +(defun ew-contain-non-ascii-p (str) + (not (eq (charsets-to-mime-charset (find-charset-string str)) 'us-ascii))) + +'( + +(ew-decode-field "To" " =?US-ASCII?Q?phrase?= ") +(ew-decode-field "To" " =?US-ASCII?Q?phrase?= < =?US-ASCII?Q?akr?= @jaist.ac.jp>") +(ew-decode-field "To" " =?US-ASCII?Q?akr?= @jaist.ac.jp") +(ew-decode-field "Subject" " =?ISO-2022-JP?B?GyRCJCIbKEI=?=") +(ew-decode-field "Content-Type" " text/vnd.latex-z(=?US-ASCII?Q?What=3F?=);charset=ISO-2022-JP") + +(ew-decode-field "To" " =?US-ASCII?Q?A=22B=5CC?= ") +(let ((ew-decode-quoted-encoded-word t)) + (ew-decode-field "To" " \"=?US-ASCII?Q?A=22B=5CC?=\" ")) + +(ew-decode-field "To" " akr@jaist.ac.jp (=?US-ASCII?Q?=28A=29B=5C?=)") + +(ew-decode-field "To" "\"A\\BC\e$B\\\"\\\\\e(B\" ") +(ew-decode-field "To" "\"A\\BC\" ") +(ew-decode-field "To" "\"\e\\$\\B\\$\\\"\e\\(\\B\" ") + +) diff --git a/ew-line.el b/ew-line.el new file mode 100644 index 0000000..c12bced --- /dev/null +++ b/ew-line.el @@ -0,0 +1,174 @@ +(require 'lex) +(require 'ew-util) +(provide 'ew-line) + +(put 'ew-line-generic 'lisp-indent-function 1) +(put 'ew-line-convert 'lisp-indent-function 1) + +(defun ew-lf-to-crlf (str) + (let ((i 0) (j 0) (l (length str)) result) + (while (< j l) + (when (equal (aref str j) ?\n) + (setq result (ew-rcons* + result + (substring str i j) + "\r") + i j)) + (setq j (1+ j))) + (when (< i l) + (setq result (ew-rcons* + result + (substring str i)))) + (apply 'concat (nreverse result)))) + +(defun ew-crlf-to-lf (str) + (let* ((i 0) (j 0) (l (length str)) (l- (1- l)) result) + (while (< j l-) + (when (and (equal (aref str j) ?\r) + (equal (aref str (1+ j)) ?\n)) + (setq result (ew-rcons* + result + (substring str i j)) + j (1+ j) + i j)) + (setq j (1+ j))) + (when (< i l) + (setq result (ew-rcons* + result + (substring str i)))) + (apply 'concat (nreverse result)))) + +(defun ew-lf-crlf-to-crlf (str) + (let* ((i 0) (j 0) (l (length str)) (l- (1- l)) result) + (while (< j l) + (cond + ((and (< j l-) + (equal (aref str j) ?\r) + (equal (aref str (1+ j)) ?\n)) + (setq j (1+ j))) + ((equal (aref str j) ?\n) + (setq result (ew-rcons* + result + (substring str i j) + "\r") + i j))) + (setq j (1+ j))) + (when (< i l) + (setq result (ew-rcons* + result + (substring str i)))) + (apply 'concat (nreverse result)))) + +(defun ew-crlf-unfold (str) + (let* ((i 0) (j 0) (l (length str)) (l- (- l 2)) result) + (while (< j l-) + (when (and (equal (aref str j) ?\r) + (equal (aref str (1+ j)) ?\n) + (member (aref str (+ j 2)) '(?\t ?\ ))) + (setq result (ew-rcons* + result + (substring str i j)) + j (+ j 2) + i j)) + (setq j (1+ j))) + (when (< i l) + (setq result (ew-rcons* + result + (substring str i)))) + (apply 'concat (nreverse result)))) + +(defun ew-lf-unfold (str) + (let* ((i 0) (j 0) (l (length str)) (l- (- l 1)) result) + (while (< j l-) + (when (and (equal (aref str j) ?\n) + (member (aref str (+ j 1)) '(?\t ?\ ))) + (setq result (ew-rcons* + result + (substring str i j)) + j (+ j 1) + i j)) + (setq j (1+ j))) + (when (< i l) + (setq result (ew-rcons* + result + (substring str i)))) + (apply 'concat (nreverse result)))) + +(defun ew-cut-cr-lf (str) + (let ((i 0) (j 0) (l (length str)) result) + (while (< j l) + (when (member (aref str j) '(?\r ?\n)) + (setq result (ew-rcons* + result + (substring str i j)) + i (1+ j))) + (setq j (1+ j))) + (when (< i l) + (setq result (ew-rcons* + result + (substring str i)))) + (apply 'concat (nreverse result)))) + +(defmacro ew-line-generic-define () + (let ((str (make-symbol "str")) + (others-fun (make-symbol "others-fun")) + (fold-fun (make-symbol "fold-fun")) + (crlf-fun (make-symbol "crlf-fun")) + (bare-cr-fun (make-symbol "bare-cr-fun")) + (bare-lf-fun (make-symbol "bare-lf-fun")) + (p (make-symbol "p")) + (q (make-symbol "q")) + (r (make-symbol "r"))) + `(defun ew-line-generic + (,str ,others-fun ,fold-fun ,crlf-fun ,bare-cr-fun ,bare-lf-fun) + (let ((,p 0) (,q (length ,str)) ,r) + (while (< ,p ,q) + (setq ,r ,p) + (lex-scan-unibyte ,str ,p ,q + ((+ [^ "\r\n"]) (when ,others-fun (funcall ,others-fun ,r ,p))) + ((?\r ?\n [" \t"]) (when ,fold-fun (funcall ,fold-fun ,r ,p))) + ((?\r ?\n) (when ,crlf-fun (funcall ,crlf-fun ,r ,p))) + ((?\r) (when ,bare-cr-fun (funcall ,bare-cr-fun ,r ,p))) + ((?\n) (when ,bare-lf-fun (funcall ,bare-lf-fun ,r ,p))) + (() (error "something wrong")))) + ,q)))) + +(ew-line-generic-define) + +(defmacro ew-line-convert-define () + (let ((str (make-symbol "str")) + (others-fun (make-symbol "others-fun")) + (fold-fun (make-symbol "fold-fun")) + (crlf-fun (make-symbol "crlf-fun")) + (bare-cr-fun (make-symbol "bare-cr-fun")) + (bare-lf-fun (make-symbol "bare-lf-fun")) + (index (make-symbol "index")) + (result (make-symbol "result")) + (start (make-symbol "starx")) + (end (make-symbol "end"))) + `(defun ew-line-convert + (,str ,others-fun ,fold-fun ,crlf-fun ,bare-cr-fun ,bare-lf-fun) + (let ((,index 0) ,result) + (when (> (ew-line-generic + ,str + ,@(mapcar + (lambda (fun) + `(when ,fun + (lambda (,start ,end) + (when (< ,index ,start) + (setq ,result + (ew-rcons* ,result + (substring ,str ,index ,start)))) + (setq ,result + (ew-rcons* ,result + (funcall ,fun + (substring ,str ,start ,end))) + ,index ,end)))) + (list others-fun fold-fun crlf-fun bare-cr-fun bare-lf-fun))) + ,index) + (setq ,result + (ew-rcons* ,result + (substring ,str ,index)))) + (apply 'concat (nreverse ,result)))))) + +(ew-line-convert-define) diff --git a/ew-quote.el b/ew-quote.el new file mode 100644 index 0000000..66806b2 --- /dev/null +++ b/ew-quote.el @@ -0,0 +1,199 @@ +;;; quoted encoded word library + +(require 'ew-util) +(require 'ew-line) +(provide 'ew-quote) + +;; This library provides functions operating strings embedding +;; unencodable encoded words. + +;;;type embedded-pattern denoted-pattern +;; Type-0 =?(QQQ)*A?B?C?= =?(Q)*A?B?C?= +;; Type-1 =?(QQQ)*QA?B?C?= Decode =?(Q)*A?B?C?= as encoded-word +;; Type-2 =?(QQQ)*QQA?B?C?= =?(Q)*A?B?C? + +;; Q : quoting character '+'. +;; A : token. But it does not start with quoting character. +;; B : token. +;; C : encoded-text. + +(defconst ew-quoting-char ?+) +(defconst ew-quoting-char-singleton (list ew-quoting-char)) +(defconst ew-quoting-chars-regexp + (concat (regexp-quote (char-to-string ew-quoting-char)) "*")) + +(defconst ew-token-regexp "[-!#-'*+0-9A-Z^-~]+") +(defconst ew-encoded-text-regexp "[!->@-~]+") +(defconst ew-encoded-word-regexp + (concat (regexp-quote "=?") + "\\(" ew-token-regexp "\\)" + (regexp-quote "?") + "\\(" ew-token-regexp "\\)" + (regexp-quote "?") + "\\(" ew-encoded-text-regexp "\\)" + (regexp-quote "?="))) + +(defconst ew-type2-regexp + (concat (regexp-quote "=?") + "\\(" ew-token-regexp "\\)" + (regexp-quote "?") + "\\(" ew-token-regexp "\\)" + (regexp-quote "?") + "\\(" ew-encoded-text-regexp "\\)" + (regexp-quote "?") + "\\'")) + +;;; + +(defun ew-quoting-char-seq (num) + (make-string num ew-quoting-char)) + +(defun ew-quote (str) + (ew-quote-concat str)) + +(defun ew-concat (&rest args) + (apply 'ew-quote-concat (mapcar 'list args))) + +(defun ew-quote-concat (&rest args) + (let (result raws tmp) + (while args + (setq tmp (car args)) + (cond + ((stringp tmp) + ;; raw string + (setq raws (cons tmp raws))) + ((listp tmp) + ;; quoted encoded word embedding strings + (let (str start eword-start charset-start quoting-end eword-end l q r) + (while tmp + (setq str (car tmp) + start 0) + (while (string-match ew-encoded-word-regexp str start) + (setq eword-start (match-beginning 0) + charset-start (match-beginning 1) + eword-end (match-end 0)) + (string-match ew-quoting-chars-regexp str charset-start) + (setq quoting-end (match-end 0) + l (- quoting-end charset-start) + q (/ l 3) + r (% l 3)) + (cond + ((= r 0) ; Type-0 + (setq raws + (ew-rcons* + raws + (substring str start charset-start) + (ew-quoting-char-seq q) + (substring str quoting-end eword-end)))) + ((= r 2) ; Type-2 + (setq raws + (ew-rcons* + raws + (substring str start charset-start) + (ew-quoting-char-seq q) + (substring str quoting-end (1- eword-end))))) + ((= r 1) ; Type-1 + (setq raws + (ew-rcons* + raws + (substring str start eword-start)) + result + (ew-rcons* + result + (ew-quote-sole (apply 'concat (nreverse raws)) t) + (substring str eword-start eword-end)) + raws ()))) + (setq start eword-end)) + (setq raws (ew-rcons* raws (substring str start)) + tmp (cdr tmp))))) + (t + (error "ew-quote-concat: %s" tmp))) + (setq args (cdr args))) + (setq result + (ew-rcons* + result + (ew-quote-sole (apply 'concat (nreverse raws)) nil))) + (apply 'concat (nreverse result)))) + +(defun ew-quote-sole (str gen-type2) + (let (result (start 0) charset-start quoting-end eword-end l) + (while (string-match ew-encoded-word-regexp str start) + (setq charset-start (match-beginning 1) + eword-end (match-end 0)) + (string-match ew-quoting-chars-regexp str charset-start) + (setq quoting-end (match-end 0) + l (* (- quoting-end charset-start) 3) + result + (ew-rcons* + result + (substring str start charset-start) + (ew-quoting-char-seq l) + (substring str quoting-end eword-end)) + start eword-end)) + (if (and gen-type2 + (string-match ew-type2-regexp str start)) + (progn + (setq charset-start (match-beginning 1) + eword-end (match-end 0)) + (string-match ew-quoting-chars-regexp str charset-start) + (setq quoting-end (match-end 0) + l (* (- quoting-end charset-start) 3) + result + (ew-rcons* + result + (substring str start charset-start) + (ew-quoting-char-seq (+ l 2)) + (substring str quoting-end eword-end) + "="))) + (setq result (ew-rcons* result (substring str start)))) + (apply 'concat (nreverse result)))) + +(defun ew-quote-eword (charset encoding encoded-text) + (string-match ew-quoting-chars-regexp charset) + (concat + "=?+" ; Type-1 + (ew-quoting-char-seq (* (- (match-end 0) (match-beginning 0)) 3)) + (substring charset (match-end 0)) + "?" + encoding + "?" + encoded-text + "?=")) + +(defun ew-encode-crlf (str) + (let ((sstart 0) + (mstart 0) + (end (length str)) result ms me) + (while (string-match "\\(\r\n\\)+" str mstart) + (setq ms (match-beginning 0) + me (match-end 0)) + (setq mstart me) + (when (and (< me end) + (member (aref str me) '(?\t ?\ ))) + (setq me (- me 2))) + (when (< ms me) + (setq result (ew-rcons* result + (substring str sstart ms) + "=?+US-ASCII?Q?") + sstart me) + (while (< ms me) + (setq result (ew-rcons* result "=0D=0A") + ms (+ ms 2))) + (setq result (ew-rcons* result "?=")))) + (when (< sstart end) + (setq result (ew-rcons* result + (substring str sstart)))) + (apply 'concat (nreverse result)))) + + +'( +(ew-quote-concat "aaa=?A?B?C?=ccc") ;"aaa=?A?B?C?=ccc" +(ew-quote-concat "aaa=?+A?B?C?=ccc") ;"aaa=?+++A?B?C?=ccc" +(ew-quote-concat '("aaa=?A?B?C?=ccc")) ;"aaa=?A?B?C?=ccc" +(ew-quote-concat '("aaa=?+++A?B?C?=ccc")) ;"aaa=?+++A?B?C?=ccc" +(ew-quote-concat "aaa=?+A?B" "?C?=ccc") ;"aaa=?+++A?B?C?=ccc" +(ew-quote-concat "a=?+A?B?C?" '("=?+US-ASCII?Q?z?=")) ;"a=?+++++A?B?C?==?+US-ASCII?Q?z?=" +(ew-quote-concat "a=?+A?B?C?=?+D?E?F?" '("=?+US-ASCII?Q?z?=")) ;"a=?+++A?B?C?=?+D?E?F?=?+US-ASCII?Q?z?=" +(ew-quote-concat "a=?+A?B?C?=?+D?E?F?=?+G?H?I?" '("=?+US-ASCII?Q?z?=")) ;"a=?+++A?B?C?=?+D?E?F?=?+++++G?H?I?==?+US-ASCII?Q?z?=" +(ew-quote-concat '("a=?++A?B?C?==?+++A?B?C?=c")) ;"a=?A?B?C?=?+A?B?C?=c" +) \ No newline at end of file diff --git a/ew-scan-m.el b/ew-scan-m.el new file mode 100644 index 0000000..6a99ff9 --- /dev/null +++ b/ew-scan-m.el @@ -0,0 +1,152 @@ +(require 'lex) +(require 'automata) +(require 'ew-data) +(require 'ew-parse) +(provide 'ew-scan-m) + +(defmacro ew-scan-mime (scan col str) + `(let ((res (ew-make-anchor col str)) + (mode 'token) + (p 0) + (q (length str)) + r + nest) + (while (< p q) + (setq r p) + (cond + ((eq mode 'token) + (,scan + str p q + ([" \t"] (ew-add-frag res r p 'ew:raw-wsp-tok)) + (?< (ew-add-token res r p 'ew:raw-lt-tok)) + (?> (ew-add-token res r p 'ew:raw-gt-tok)) + (?@ (ew-add-token res r p 'ew:raw-at-tok)) + (?, (ew-add-token res r p 'ew:raw-comma-tok)) + (?\; (ew-add-token res r p 'ew:raw-semicolon-tok)) + (?: (ew-add-token res r p 'ew:raw-colon-tok)) + (?/ (ew-add-token res r p 'ew:raw-slash-tok)) + (?? (ew-add-token res r p 'ew:raw-question-tok)) + (?= (ew-add-token res r p 'ew:raw-equal-tok)) + ((?\r ?\n [" \t"]) + (ew-add-frag res r p 'ew:raw-fold-tok)) + ((?\r ?\n [^ " \t"]) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok)) + ((+ [(?a ?z) (?A ?Z) (?0 ?9) "!#$%&'*+-.^_`{|}~" non-ascii]) + (ew-add-token res r p 'ew:raw-atom-tok)) + (?\" (ew-add-open res r p 'ew:raw-qs-begin-tok) + (setq mode 'quoted-string)) + (?\[ (ew-add-open res r p 'ew:raw-dl-begin-tok) + (setq mode 'domain-literal)) + (?\( (ew-add-open res r p 'ew:raw-cm-begin-tok) + (setq mode 'comment + nest 1)) + (() (ew-add-frag res r q 'ew:raw-err-tok) (setq p q)))) + ((eq mode 'quoted-string) + (,scan + str p q + (?\" (ew-add-close-token res r p 'ew:raw-qs-end-tok) + (setq mode 'token)) + ((?\\ ?\r ?\n [" \t"]) + (ew-add-frag res r p 'ew:raw-qs-qfold-tok)) + ((?\\ ?\r ?\n [^ " \t"]) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok)) + (((* [^ "\"\\ \t\r"]) (* (+ ?\r) [^ "\"\\ \t\r\n"] (* [^ "\"\\ \t\r"])) (* ?\r) + (?\r ?\n [" \t"])) + (when (< r (- p 3)) + (ew-add-frag res r (- p 3) 'ew:raw-qs-texts-tok) + (setq r (- p 3))) + (ew-add-frag res r p 'ew:raw-qs-fold-tok)) + (((* [^ "\"\\ \t\r"]) (* (+ ?\r) [^ "\"\\ \t\r\n"] (* [^ "\"\\ \t\r"])) (* ?\r) + (?\r ?\n [^ " \t"])) + (when (< r (- p 3)) + (ew-add-frag res r (- p 3) 'ew:raw-qs-texts-tok) + (setq r (- p 3))) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok)) + ((?\\ (any)) + (ew-add-frag res r p 'ew:raw-qs-qpair-tok)) + ([" \t"] + (ew-add-frag res r p 'ew:raw-qs-wsp-tok)) + (((* [^ "\"\\ \t\r"]) (* (+ ?\r) [^ "\"\\ \t\r\n"] (* [^ "\"\\ \t\r"])) (* ?\r)) + (if (< r p) + (ew-add-frag res r p 'ew:raw-qs-texts-tok) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok))))) + ((eq mode 'domain-literal) + (,scan + str p q + (?\] (ew-add-close-token res r p 'ew:raw-dl-end-tok) + (setq mode 'token)) + ((?\\ ?\r ?\n [" \t"]) + (ew-add-frag res r p 'ew:raw-dl-qfold-tok)) + ((?\\ ?\r ?\n [^ " \t"]) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok)) + (((* [^ "[]\\ \t\r"]) (* (+ ?\r) [^ "[]\\ \t\r\n"] (* [^ "[]\\ \t\r"])) (* ?\r) + (?\r ?\n [" \t"])) + (when (< r (- p 3)) + (ew-add-frag res r (- p 3) 'ew:raw-dl-texts-tok) + (setq r (- p 3))) + (ew-add-frag res r p 'ew:raw-dl-fold-tok)) + (((* [^ "[]\\ \t\r"]) (* (+ ?\r) [^ "[]\\ \t\r\n"] (* [^ "[]\\ \t\r"])) (* ?\r) + (?\r ?\n [^ " \t"])) + (when (< r (- p 3)) + (ew-add-frag res r (- p 3) 'ew:raw-dl-texts-tok) + (setq r (- p 3))) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok)) + ((?\\ (any)) + (ew-add-frag res r p 'ew:raw-dl-qpair-tok)) + ([" \t"] + (ew-add-frag res r p 'ew:raw-dl-wsp-tok)) + (((* [^ "[]\\ \t\r"]) (* (+ ?\r) [^ "[]\\ \t\r\n"] (* [^ "[]\\ \t\r"])) (* ?\r)) + (if (< r p) + (ew-add-frag res r p 'ew:raw-dl-texts-tok) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok))))) + ((eq mode 'comment) + (,scan + str p q + (?\( (ew-add-open res r p 'ew:raw-cm-nested-begin-tok) + (setq nest (1+ nest))) + (?\) (setq nest (1- nest)) + (if (zerop nest) + (progn + (ew-add-close res r p 'ew:raw-cm-end-tok) + (setq mode 'token)) + (ew-add-close res r p 'ew:raw-cm-nested-end-tok))) + ((?\\ ?\r ?\n [" \t"]) + (ew-add-frag res r p 'ew:raw-cm-qfold-tok)) + ((?\\ ?\r ?\n [^ " \t"]) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok)) + (((* [^ "()\\ \t\r"]) (* (+ ?\r) [^ "()\\ \t\r\n"] (* [^ "()\\ \t\r"])) (* ?\r) + (?\r ?\n [" \t"])) + (when (< r (- p 3)) + (ew-add-frag res r (- p 3) 'ew:raw-cm-texts-tok) + (setq r (- p 3))) + (ew-add-frag res r p 'ew:raw-cm-fold-tok)) + (((* [^ "()\\ \t\r"]) (* (+ ?\r) [^ "()\\ \t\r\n"] (* [^ "()\\ \t\r"])) (* ?\r) + (?\r ?\n [^ " \t"])) + (when (< r (- p 3)) + (ew-add-frag res r (- p 3) 'ew:raw-cm-texts-tok) + (setq r (- p 3))) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok)) + ((?\\ (any)) + (ew-add-frag res r p 'ew:raw-cm-qpair-tok)) + ([" \t"] + (ew-add-frag res r p 'ew:raw-cm-wsp-tok)) + (((* [^ "()\\ \t\r"]) (* (+ ?\r) [^ "()\\ \t\r\n"] (* [^ "()\\ \t\r"])) (* ?\r)) + (if (< r p) + (ew-add-frag res r p 'ew:raw-cm-texts-tok) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok))))))) + (ew-terminate res) + res)) + +(defun ew-scan-unibyte-mime (col str) + (ew-scan-mime lex-scan-unibyte col str)) +(defun ew-scan-multibyte-mime (col str) + (ew-scan-mime lex-scan-multibyte col str)) + +'( +(npp + (mapcar + (lambda (frag) (cons (get frag 'type) (symbol-name frag))) + (ew-frag-list + (ew-scan-unibyte-mime + 0 " text/vnd.latex-z; charset=ISO-2022-JP")))) +) diff --git a/ew-scan-s.el b/ew-scan-s.el new file mode 100644 index 0000000..5673e77 --- /dev/null +++ b/ew-scan-s.el @@ -0,0 +1,157 @@ +(require 'lex) +(require 'automata) +(require 'ew-data) +(require 'ew-parse) +(provide 'ew-scan-s) + +(defmacro ew-scan-std11 (scan col str) + `(let ((res (ew-make-anchor col str)) + (mode 'token) + (p 0) + (q (length str)) + r + nest) + (while (< p q) + (setq r p) + (cond + ((eq mode 'token) + (,scan + str p q + ([" \t"] (ew-add-frag res r p 'ew:raw-wsp-tok)) + (?< (ew-add-token res r p 'ew:raw-lt-tok)) + (?> (ew-add-token res r p 'ew:raw-gt-tok)) + (?@ (ew-add-token res r p 'ew:raw-at-tok)) + (?, (ew-add-token res r p 'ew:raw-comma-tok)) + (?\; (ew-add-token res r p 'ew:raw-semicolon-tok)) + (?: (ew-add-token res r p 'ew:raw-colon-tok)) + (?. (ew-add-token res r p 'ew:raw-dot-tok)) + ((?\r ?\n [" \t"]) + (ew-add-frag res r p 'ew:raw-fold-tok)) + ((?\r ?\n [^ " \t"]) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok)) + ((+ [(?a ?z) (?A ?Z) (?0 ?9) "!#$%&'*+-/=?^_`{|}~" non-ascii]) + (ew-add-token res r p 'ew:raw-atom-tok)) + (?\" (ew-add-open res r p 'ew:raw-qs-begin-tok) + (setq mode 'quoted-string)) + (?\[ (ew-add-open res r p 'ew:raw-dl-begin-tok) + (setq mode 'domain-literal)) + (?\( (ew-add-open res r p 'ew:raw-cm-begin-tok) + (setq mode 'comment + nest 1)) + (() (ew-add-frag res r q 'ew:raw-err-tok) (setq p q)))) + ((eq mode 'quoted-string) + (,scan + str p q + (?\" (ew-add-close-token res r p 'ew:raw-qs-end-tok) + (setq mode 'token)) + ((?\\ ?\r ?\n [" \t"]) + (ew-add-frag res r p 'ew:raw-qs-qfold-tok)) + ((?\\ ?\r ?\n [^ " \t"]) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok)) + (((* [^ "\"\\ \t\r"]) (* (+ ?\r) [^ "\"\\ \t\r\n"] (* [^ "\"\\ \t\r"])) (* ?\r) + (?\r ?\n [" \t"])) + (when (< r (- p 3)) + (ew-add-frag res r (- p 3) 'ew:raw-qs-texts-tok) + (setq r (- p 3))) + (ew-add-frag res r p 'ew:raw-qs-fold-tok)) + (((* [^ "\"\\ \t\r"]) (* (+ ?\r) [^ "\"\\ \t\r\n"] (* [^ "\"\\ \t\r"])) (* ?\r) + (?\r ?\n [^ " \t"])) + (when (< r (- p 3)) + (ew-add-frag res r (- p 3) 'ew:raw-qs-texts-tok) + (setq r (- p 3))) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok)) + ((?\\ (any)) + (ew-add-frag res r p 'ew:raw-qs-qpair-tok)) + ([" \t"] + (ew-add-frag res r p 'ew:raw-qs-wsp-tok)) + (((* [^ "\"\\ \t\r"]) (* (+ ?\r) [^ "\"\\ \t\r\n"] (* [^ "\"\\ \t\r"])) (* ?\r)) + (if (< r p) + (ew-add-frag res r p 'ew:raw-qs-texts-tok) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok))))) + ((eq mode 'domain-literal) + (,scan + str p q + (?\] (ew-add-close-token res r p 'ew:raw-dl-end-tok) + (setq mode 'token)) + ((?\\ ?\r ?\n [" \t"]) + (ew-add-frag res r p 'ew:raw-dl-qfold-tok)) + ((?\\ ?\r ?\n [^ " \t"]) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok)) + (((* [^ "[]\\ \t\r"]) (* (+ ?\r) [^ "[]\\ \t\r\n"] (* [^ "[]\\ \t\r"])) (* ?\r) + (?\r ?\n [" \t"])) + (when (< r (- p 3)) + (ew-add-frag res r (- p 3) 'ew:raw-dl-texts-tok) + (setq r (- p 3))) + (ew-add-frag res r p 'ew:raw-dl-fold-tok)) + (((* [^ "[]\\ \t\r"]) (* (+ ?\r) [^ "[]\\ \t\r\n"] (* [^ "[]\\ \t\r"])) (* ?\r) + (?\r ?\n [^ " \t"])) + (when (< r (- p 3)) + (ew-add-frag res r (- p 3) 'ew:raw-dl-texts-tok) + (setq r (- p 3))) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok)) + ((?\\ (any)) + (ew-add-frag res r p 'ew:raw-dl-qpair-tok)) + ([" \t"] + (ew-add-frag res r p 'ew:raw-dl-wsp-tok)) + (((* [^ "[]\\ \t\r"]) (* (+ ?\r) [^ "[]\\ \t\r\n"] (* [^ "[]\\ \t\r"])) (* ?\r)) + (if (< r p) + (ew-add-frag res r p 'ew:raw-dl-texts-tok) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok))))) + ((eq mode 'comment) + (,scan + str p q + (?\( (ew-add-open res r p 'ew:raw-cm-nested-begin-tok) + (setq nest (1+ nest))) + (?\) (setq nest (1- nest)) + (if (zerop nest) + (progn + (ew-add-close res r p 'ew:raw-cm-end-tok) + (setq mode 'token)) + (ew-add-close res r p 'ew:raw-cm-nested-end-tok))) + ((?\\ ?\r ?\n [" \t"]) + (ew-add-frag res r p 'ew:raw-cm-qfold-tok)) + ((?\\ ?\r ?\n [^ " \t"]) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok)) + (((* [^ "()\\ \t\r"]) (* (+ ?\r) [^ "()\\ \t\r\n"] (* [^ "()\\ \t\r"])) (* ?\r) + (?\r ?\n [" \t"])) + (when (< r (- p 3)) + (ew-add-frag res r (- p 3) 'ew:raw-cm-texts-tok) + (setq r (- p 3))) + (ew-add-frag res r p 'ew:raw-cm-fold-tok)) + (((* [^ "()\\ \t\r"]) (* (+ ?\r) [^ "()\\ \t\r\n"] (* [^ "()\\ \t\r"])) (* ?\r) + (?\r ?\n [^ " \t"])) + (when (< r (- p 3)) + (ew-add-frag res r (- p 3) 'ew:raw-cm-texts-tok) + (setq r (- p 3))) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok)) + ((?\\ (any)) + (ew-add-frag res r p 'ew:raw-cm-qpair-tok)) + ([" \t"] + (ew-add-frag res r p 'ew:raw-cm-wsp-tok)) + (((* [^ "()\\ \t\r"]) (* (+ ?\r) [^ "()\\ \t\r\n"] (* [^ "()\\ \t\r"])) (* ?\r)) + (if (< r p) + (ew-add-frag res r p 'ew:raw-cm-texts-tok) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok))))))) + (ew-terminate res) + res)) + +(defun ew-scan-unibyte-std11 (col str) + (ew-scan-std11 lex-scan-unibyte col str)) +(defun ew-scan-multibyte-std11 (col str) + (ew-scan-std11 lex-scan-multibyte col str)) + +'( +(npp + (mapcar + 'symbol-plist + (ew-frag-list + (ew-scan-unibyte-std11 + 0 " Tanaka Akira (Tanaka Akira)")))) + +(npp + (mapcar + (lambda (frag) (cons (get frag 'type) (symbol-name frag))) + (ew-frag-list + (ew-scan-unibyte-std11 + 0 " Tanaka Akira (Tanaka Akira)")))) +) diff --git a/ew-scan-u.el b/ew-scan-u.el new file mode 100644 index 0000000..ebbac5d --- /dev/null +++ b/ew-scan-u.el @@ -0,0 +1,56 @@ +(require 'lex) +(require 'automata) +(require 'ew-data) +(require 'ew-parse) +(provide 'ew-scan-u) + +(defmacro ew-scan-unstructured (scan col str) + `(let ((res (ew-make-anchor col str)) + (p 0) + (q (length str)) + r) + (while (< p q) + (setq r p) + (,scan + str p q + ([" \t"] (ew-add-frag res r p 'ew:raw-us-wsp-tok)) + (((* [^ " \t\r"]) (* (+ ?\r) [^ " \t\r\n"] (* [^ " \t\r"])) (* ?\r) + (?\r ?\n [" \t"])) + (when (< r (- p 3)) + (ew-add-frag res r (- p 3) 'ew:raw-us-texts-tok) + (setq r (- p 3))) + (ew-add-frag res r p 'ew:raw-us-fold-tok)) + (((* [^ " \t\r"]) (* (+ ?\r) [^ " \t\r\n"] (* [^ " \t\r"])) (* ?\r) + (?\r ?\n [^ " \t"])) + (when (< r (- p 3)) + (ew-add-frag res r (- p 3) 'ew:raw-us-texts-tok) + (setq r (- p 3))) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok)) + (((* [^ " \t\r"]) (* (+ ?\r) [^ " \t\r\n"] (* [^ " \t\r"])) (* ?\r)) + (if (< r p) + (ew-add-frag res r p 'ew:raw-us-texts-tok) + (ew-add-frag res r (setq p q) 'ew:raw-err-tok))))) + (ew-terminate res) + res)) + +(defun ew-scan-unibyte-unstructured (col str) + (ew-scan-unstructured lex-scan-unibyte col str)) +(defun ew-scan-multibyte-unstructured (col str) + (ew-scan-unstructured lex-scan-multibyte col str)) + +'( +(npp + (mapcar + (lambda (frag) (cons (get frag 'type) (symbol-name frag))) + (ew-frag-list + (ew-scan-unibyte-unstructured + 0 " Hello! =?US-ASCII?Q?Hello!?=")))) + +(npp + (mapcar + (lambda (frag) (cons (get frag 'type) (symbol-name frag))) + (ew-frag-list + (ew-scan-unibyte-unstructured + 0 " \r\na")))) + +) diff --git a/ew-unit.el b/ew-unit.el new file mode 100644 index 0000000..74b379f --- /dev/null +++ b/ew-unit.el @@ -0,0 +1,90 @@ +(require 'closure) +(require 'ew-line) +(require 'ew-quote) +(require 'mel) + +(provide 'ew-unit) + +(defconst ew-anchored-encoded-word-regexp + (concat "\\`" ew-encoded-word-regexp "\\'")) + +(defconst ew-b-regexp + (concat "\\`\\(" + "[A-Za-z0-9+/]" + "[A-Za-z0-9+/]" + "[A-Za-z0-9+/]" + "[A-Za-z0-9+/]" + "\\)*" + "[A-Za-z0-9+/]" + "[A-Za-z0-9+/]" + "\\(==\\|" + "[A-Za-z0-9+/]" + "[A-Za-z0-9+/=]" + "\\)\\'")) + +(defconst ew-q-regexp "\\`\\([^=?]\\|=[0-9A-Fa-f][0-9A-Fa-f]\\)*\\'") + +(defconst ew-byte-decoder-alist + '(("B" . ew-b-decode) + ("Q" . ew-q-decode))) + +(defconst ew-byte-checker-alist + '(("B" . ew-b-check) + ("Q" . ew-q-check))) + +(defun ew-b-check (encoding encoded-text) (string-match ew-b-regexp encoded-text)) +(defun ew-q-check (encoding encoded-text) (string-match ew-q-regexp encoded-text)) + +(defun ew-eword-p (str) + (let ((len (length str))) + (and + (<= 3 len) + (string= (substring str 0 2) "=?") + (string= (substring str (- len 2) len) "?=")))) + +(defun ew-decode-eword (str &optional eword-filter1 eword-filter2) + (if (string-match ew-anchored-encoded-word-regexp str) + (let ((charset (match-string 1 str)) + (encoding (match-string 2 str)) + (encoded-text (match-string 3 str)) + bdec cdec + bcheck + tmp) + (if (and (setq bdec (ew-byte-decoder encoding)) + (setq cdec (ew-char-decoder charset))) + (if (or (null (setq bcheck (ew-byte-checker encoding))) + (funcall bcheck encoding encoded-text)) + (progn + (setq tmp (closure-call cdec (funcall bdec encoded-text))) + (when eword-filter1 (setq tmp (closure-call eword-filter1 tmp))) + (setq tmp (ew-quote tmp)) + (when eword-filter2 (setq tmp (closure-call eword-filter2 tmp))) + tmp) + (ew-quote str)) + (ew-quote-eword charset encoding encoded-text))) + (ew-quote str))) + +(defun ew-byte-decoder (encoding) + (cdr (assoc (upcase encoding) ew-byte-decoder-alist))) + +(defun ew-byte-checker (encoding) + (cdr (assoc (upcase encoding) ew-byte-checker-alist))) + +(defalias 'ew-b-decode 'base64-decode-string) +(defalias 'ew-q-decode 'q-encoding-decode-string) + +(defconst ew-charset-aliases + '((us-ascii . iso-8859-1) + (iso-2022-jp-2 . iso-2022-7bit-ss2))) +(defun ew-char-decoder (charset) + (catch 'return + (setq charset (downcase charset)) + (let ((sym (intern charset)) + tmp cs) + (when (setq tmp (assq sym ew-charset-aliases)) + (setq sym (cdr tmp))) + (setq cs (intern (concat (symbol-name sym) "-unix"))) + (when (coding-system-p cs) + (throw 'return + (closure-make (lambda (str) (decode-coding-string str cs)) cs))) + nil))) diff --git a/ew-util.el b/ew-util.el new file mode 100644 index 0000000..fb1b373 --- /dev/null +++ b/ew-util.el @@ -0,0 +1,19 @@ +(provide 'ew-util) + +(defmacro ew-cons* (seed &rest rest) + (setq rest (nreverse (cons seed rest)) + seed (car rest) + rest (cdr rest)) + (while rest + (setq seed `(cons ,(car rest) ,seed) + rest (cdr rest))) + seed) + +(defmacro ew-rcons* (seed &rest rest) + (while rest + (setq seed `(cons ,(car rest) ,seed) + rest (cdr rest))) + seed) + +(defmacro ew-rappend (a b) + `(append (reverse ,b) ,a)) diff --git a/eword-decode.el b/eword-decode.el index bee20c3..affdce6 100644 --- a/eword-decode.el +++ b/eword-decode.el @@ -36,10 +36,21 @@ (require 'mel) (require 'mime-def) +(require 'ew-dec) + (defgroup eword-decode nil "Encoded-word decoding" :group 'mime) +;;; TEST + +(defmacro rotate-memo (var val) + `(progn + (unless (boundp ',var) (setq ,var ())) + (setq ,var (cons ,val ,var)) + (let ((tmp (last ,var (- (length ,var) 100)))) + (when tmp (setcdr tmp nil))) + ,var)) ;;; @ variables ;;; @@ -397,6 +408,10 @@ Otherwise it decodes non-ASCII bit patterns as the default-mime-charset. If SEPARATOR is not nil, it is used as header separator." (interactive "*") + (rotate-memo args-eword-decode-header + (list code-conversion)) + (unless code-conversion + (message "eword-decode-header is called with no code-conversion")) (if (and code-conversion (not (mime-charset-to-coding-system code-conversion))) (setq code-conversion default-mime-charset)) @@ -404,34 +419,17 @@ If SEPARATOR is not nil, it is used as header separator." (save-restriction (std11-narrow-to-header separator) (if code-conversion - (let (beg p end field-name len) + (let (beg p end field-name field-body len) (goto-char (point-min)) (while (re-search-forward std11-field-head-regexp nil t) (setq beg (match-beginning 0) p (match-end 0) field-name (buffer-substring beg (1- p)) - len (string-width field-name) - field-name (intern (capitalize field-name)) - end (std11-field-end)) - (cond ((memq field-name eword-decode-ignored-field-list) - ;; Don't decode - ) - ((memq field-name eword-decode-structured-field-list) - ;; Decode as structured field - (let ((body (buffer-substring p end))) - (delete-region p end) - (insert (eword-decode-and-fold-structured-field - body (1+ len))) - )) - (t - ;; Decode as unstructured field - (save-restriction - (narrow-to-region beg (1+ end)) - (goto-char p) - (eword-decode-region beg (point-max) 'unfold nil - code-conversion) - (goto-char (point-max)) - ))))) + end (std11-field-end) + field-body (buffer-substring p end)) + (delete-region p end) + (insert (ew-decode-field field-name (ew-lf-crlf-to-crlf field-body))) + )) (eword-decode-region (point-min) (point-max) t nil nil) )))) @@ -745,6 +743,8 @@ characters are regarded as variable `default-mime-charset'. If an encoded-word is broken or your emacs implementation can not decode the charset included in it, it is not decoded." + (rotate-memo args-eword-decode-and-unfold-structured-field + (list string)) (let ((tokens (eword-lexical-analyze string 'must-unfold)) (result "")) (while tokens @@ -773,15 +773,21 @@ decode the charset included in it, it is not decoded. If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even if there are in decoded encoded-words (generated by bad manner MUA such as a version of Net$cape)." + (rotate-memo args-eword-decode-structured-field-body + (list string must-unfold start-column max-column)) (if start-column - ;; fold with max-column - (eword-decode-and-fold-structured-field - string start-column max-column must-unfold) + ;; fold with max-column (folding is not implemented.) + (let* ((ew-decode-field-default-syntax '(ew-scan-unibyte-std11)) + (decoded (ew-decode-field (make-string (1- start-column) ?X) + (ew-lf-crlf-to-crlf string) + (if must-unfold 'ew-cut-cr-lf)))) + (if must-unfold (ew-cut-cr-lf decoded) decoded)) ;; Don't fold - (mapconcat (function eword-decode-token) - (eword-lexical-analyze string must-unfold) - "") - )) + (let* ((ew-decode-field-default-syntax '(ew-scan-unibyte-std11)) + (decoded (ew-decode-field "" + (ew-lf-crlf-to-crlf string) + (if must-unfold 'ew-cut-cr-lf)))) + (if must-unfold (ew-cut-cr-lf decoded) decoded)))) (defun eword-decode-unstructured-field-body (string &optional must-unfold) "Decode non us-ascii characters in STRING as unstructured field body. @@ -797,7 +803,14 @@ decode the charset included in it, it is not decoded. If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even if there are in decoded encoded-words (generated by bad manner MUA such as a version of Net$cape)." - (eword-decode-string string must-unfold default-mime-charset)) + (rotate-memo args-eword-decode-unstructured-field-body + (list string must-unfold)) + (let ((decoded (ew-decode-field "" + (ew-lf-crlf-to-crlf string) + (if must-unfold 'ew-cut-cr-lf)))) + (if must-unfold + (ew-cut-cr-lf decoded) + decoded))) (defun eword-extract-address-components (string) "Extract full name and canonical address from STRING. diff --git a/lex.el b/lex.el new file mode 100644 index 0000000..cd0040d --- /dev/null +++ b/lex.el @@ -0,0 +1,196 @@ +(require 'emu) +(require 'rx) +(require 'automata) +(provide 'lex) + +(put 'lex-scan-multibyte 'lisp-indent-function 3) +(put 'lex-scan-unibyte 'lisp-indent-function 3) + +;;; automata generation + +(defun lex-automata (rx) + (let* ((rx (rx-simplify rx)) + (stack (list rx)) ; list of rx + (table (list (rx-cons* rx 0 (lex-make-box (list 'd1 'd2))))) + ; list of (rx id . box-for-reverse-links) + (states ()) ; list of (id act trans . box-for-reverse-links) + ; where trans = list of (pc id . box-for-reverse-links) + (next-id 1) + tbl-ent box id pcs act pc trans rx-stepped p) + (while (consp stack) + (setq rx (car stack) + stack (cdr stack) + tbl-ent (assoc rx table) + id (cadr tbl-ent) + box (cddr tbl-ent) + pcs (rx-head-pcs rx) + act (rx-head-act rx) + trans ()) + (while (consp pcs) + (setq pc (car pcs) + pcs (cdr pcs) + rx-stepped (rx-step rx pc) + p (assoc rx-stepped table)) + (if p + (progn + (setq trans (cons (cons pc (cdr p)) trans)) + (lex-add-box (cddr p) id)) + (setq p (rx-cons* rx-stepped next-id (lex-make-box (list id))) + trans (cons (cons pc (cdr p)) trans) + table (cons p table) + next-id (1+ next-id) + stack (cons rx-stepped stack)))) + (setq states + (cons (rx-cons* id act trans box) + states))) + states)) + +;;; automata coding + +(defvar lex-pc-var (make-symbol "pc")) +(defvar lex-act-var (make-symbol "act")) +(defvar lex-escape-tag (make-symbol "esc")) + +(defun lex-gen-machine (states cs acts read-macro save-macro) + `(let (,lex-pc-var ,lex-act-var) + (catch ',lex-escape-tag + (automata + ,lex-pc-var 0 + ,@(mapcar + (lambda (s) (lex-gen-state s cs read-macro save-macro)) + states))) + (automata-branch + ,lex-act-var ,(apply 'natset-single (mapcar 'car acts)) automata-never-fail + ,@(mapcar + (lambda (act) `(,(natset-single (car act)) nil ,@(cdr act))) + acts)))) + +(defun lex-gen-state (s cs read-macro save-macro) + (let ((id (nth 0 s)) + (act (nth 1 s)) + (trans (nth 2 s))) + `(,id + (progn + ,@(if act + `((lex-match ,(cdr act)) (,save-macro)) + ()) + ,@(if (consp trans) `((,read-macro ,lex-pc-var)))) + (lex-fail) + ,@(mapcar + (lambda (tr) `(,(let ((l (member (car tr) cs))) + (if (null (cdr l)) + (natset-seg (car l)) + (natset-seg (car l) (1- (cadr l))))) + ,(cadr tr))) + trans)))) + +;;; internal macros + +(defmacro lex-match (id) + `(setq ,lex-act-var ',id)) +(defmacro lex-fail () + `(throw ',lex-escape-tag nil)) + +;;; user interface macro + +;;; multibyte + +(defvar lex-scan-multibyte-str-var (make-symbol "str")) +(defvar lex-scan-multibyte-ptr-var (make-symbol "ptr")) +(defvar lex-scan-multibyte-end-var (make-symbol "end")) +(defvar lex-scan-multibyte-mch-var (make-symbol "mch")) + +(defmacro lex-scan-multibyte-read (pc) + `(if (< ,lex-scan-multibyte-ptr-var ,lex-scan-multibyte-end-var) + (setq ,pc (sref ,lex-scan-multibyte-str-var ,lex-scan-multibyte-ptr-var) + ,lex-scan-multibyte-ptr-var (char-next-index ,pc ,lex-scan-multibyte-ptr-var) + ,pc (char-int ,pc)) + (lex-fail))) + +(defmacro lex-scan-multibyte-save () + `(setq ,lex-scan-multibyte-mch-var ,lex-scan-multibyte-ptr-var)) + +(defmacro lex-scan-multibyte (str start end &rest clauses) + (if (not start) (setq start 0)) + (if (not end) (setq end `(length ,lex-scan-multibyte-str-var))) + (let ((id 1) (rx ()) (acts ()) tmp code + (restore-code (if (symbolp start) `(setq ,start ,lex-scan-multibyte-mch-var)))) + (while (consp clauses) + (setq rx (cons (rx-con (caar clauses) (rx-act id)) rx) + acts (cons (cons id (cons restore-code (cdar clauses))) acts) + id (1+ id) + clauses (cdr clauses))) + (setq rx (rx-alt rx) + tmp (rx-categolize-char (rx-desugar rx))) + `(let* ((,lex-scan-multibyte-str-var ,str) + (,lex-scan-multibyte-ptr-var ,start) + (,lex-scan-multibyte-end-var ,end) + ,lex-scan-multibyte-mch-var) + ,(lex-gen-machine (lex-automata (car tmp)) (cdr tmp) acts 'lex-scan-multibyte-read 'lex-scan-multibyte-save)))) + +;;; unibyte + +(defvar lex-scan-unibyte-str-var (make-symbol "str")) +(defvar lex-scan-unibyte-ptr-var (make-symbol "ptr")) +(defvar lex-scan-unibyte-end-var (make-symbol "end")) +(defvar lex-scan-unibyte-mch-var (make-symbol "mch")) + +(defmacro lex-scan-unibyte-read (pc) + `(if (< ,lex-scan-unibyte-ptr-var ,lex-scan-unibyte-end-var) + (setq ,pc (aref ,lex-scan-unibyte-str-var ,lex-scan-unibyte-ptr-var) + ,lex-scan-unibyte-ptr-var (1+ ,lex-scan-unibyte-ptr-var) + ,pc (char-int ,pc)) + (lex-fail))) + +(defmacro lex-scan-unibyte-save () + `(setq ,lex-scan-unibyte-mch-var ,lex-scan-unibyte-ptr-var)) + +(defmacro lex-scan-unibyte (str start end &rest clauses) + (if (not start) (setq start 0)) + (if (not end) (setq end `(length ,lex-scan-unibyte-str-var))) + (let ((id 1) (rx ()) (acts ()) tmp code + (restore-code (if (symbolp start) `(setq ,start ,lex-scan-unibyte-mch-var)))) + (while (consp clauses) + (setq rx (cons (rx-con (caar clauses) (rx-act id)) rx) + acts (cons (cons id (cons restore-code (cdar clauses))) acts) + id (1+ id) + clauses (cdr clauses))) + (setq rx (rx-alt rx) + tmp (rx-categolize-char (rx-desugar rx))) + `(let* ((,lex-scan-unibyte-str-var ,str) + (,lex-scan-unibyte-ptr-var ,start) + (,lex-scan-unibyte-end-var ,end) + ,lex-scan-unibyte-mch-var) + ,(lex-gen-machine (lex-automata (car tmp)) (cdr tmp) acts 'lex-scan-unibyte-read 'lex-scan-unibyte-save)))) + +;;; utilities + +(defun lex-make-box (val) + (list val)) +(defalias 'lex-box-ref 'car) + +(defun lex-add-box (box val) + (if (not (member val (car box))) + (setcar box (cons val (car box))))) + +;;; testing +'( + + (mapcar (lambda (v) (set v (intern (symbol-name (symbol-value v))))) + '(lex-pc-var + lex-act-var + lex-escape-tag + lex-scan-multibyte-str-var + lex-scan-multibyte-ptr-var + lex-scan-multibyte-end-var + lex-scan-multibyte-mch-var + lex-scan-unibyte-str-var + lex-scan-unibyte-ptr-var + lex-scan-unibyte-end-var + lex-scan-unibyte-mch-var)) + + (lex-scan-multibyte + "aaa" 0 3 + (?a 'a)) + +) diff --git a/lr-driver.el b/lr-driver.el new file mode 100644 index 0000000..7c14b19 --- /dev/null +++ b/lr-driver.el @@ -0,0 +1,72 @@ +;; ---------------------------------------------------------------------- ;; +;; FICHIER : lr-dvr.scm ;; +;; DATE DE CREATION : Fri May 31 15:47:05 1996 ;; +;; DERNIERE MODIFICATION : Fri May 31 15:51:13 1996 ;; +;; ---------------------------------------------------------------------- ;; +;; Copyright (c) 1996 Dominique Boucher ;; +;; ---------------------------------------------------------------------- ;; +;; The LR parser driver ;; +;; ;; +;; lr-dvr.scm is part of the lalr.scm distribution which is free ;; +;; software; you can redistribute it and/or modify ;; +;; it under the terms of the GNU General Public License as published by ;; +;; the Free Software Foundation; either version 2, or (at your option) ;; +;; any later version. ;; +;; ;; +;; lalr.scm is distributed in the hope that it will be useful, ;; +;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; +;; GNU General Public License for more details. ;; +;; ;; +;; You should have received a copy of the GNU General Public License ;; +;; along with lalr.scm; see the file COPYING. If not, write to ;; +;; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. ;; +;; ---------------------------------------------------------------------- ;; + +;; 1998/08/16: Tanaka Akira transplants from Scheme to Emacs-Lisp. + +(provide 'lr-driver) + +(defconst lr-max-stack-size 500) + +(defun lr-push (stack sp new-cat goto-table lval) + (let* ((state (aref stack sp)) + (new-state (cdr (assq new-cat (aref goto-table state)))) + (new-sp (+ sp 2))) + (if (>= new-sp lr-max-stack-size) + (error "PARSE ERROR : stack overflow") + (progn + (aset stack new-sp new-state) + (aset stack (- new-sp 1) lval) + new-sp)))) + +(defun lr-parse (lexerp errorp action-table goto-table reduction-table token-defs) + (let ((stack (make-vector lr-max-stack-size 0)) (sp 0) (input (funcall lexerp))) + (catch 'parser + (while t + (let* ((state (aref stack sp)) + (i (car input)) + (act (let* ((l (aref action-table state)) (y (assq i l))) (if y (cdr y) (cdar l))))) + + (cond + + ;; Input succesfully parsed + ((eq act 'accept) + (throw 'parser (aref stack 1))) + + ;; Syntax error in input + ((eq act '*error*) + (throw 'parser + (funcall errorp "PARSE ERROR : unexpected token : " + (cdr (assq i token-defs))))) + + ;; Shift current token on top of the stack + ((>= act 0) + (aset stack (+ sp 1) (cdr input)) + (aset stack (+ sp 2) act) + (setq sp (+ sp 2)) + (setq input (funcall lexerp))) + + ;; Reduce by rule (- act) + (t + (setq sp (funcall (aref reduction-table (- act)) stack sp goto-table))))))))) diff --git a/natset.el b/natset.el new file mode 100644 index 0000000..8887ea5 --- /dev/null +++ b/natset.el @@ -0,0 +1,242 @@ +(provide 'natset) + +;;; pacage for set of natural number. +;; (natural number includes zero.) + +;;; predicates + +(defun natset-empty-p (ns) + "Returns t if NS is empty." + (equal ns ())) + +(defun natset-full-p (ns) + "Returns t if NS is full." + (equal ns '(0))) + +(defun natset-closed-p (ns) + "Returns t if NS is closed." + (= (logand (length ns) 1) 0)) + +(defun natset-open-p (ns) + "Returns t if NS is open." + (= (logand (length ns) 1) 1)) + +(defun natset-has-p (ns i) + "Returns t if I is in NS." + (not (natset-empty-p (natset-intersection (natset-single i) ns)))) + +(defun natset-has-intersection-p (ns1 ns2) + "Returns t if the intersection of NS1 and NS2 is not empty." + (not (natset-empty-p (natset-intersection ns1 ns2)))) + +(defun natset-include-p (ns1 ns2) + "Returns t if NS1 includes NS2." + (equal ns1 (natset-union ns1 ns2))) + +;;; accessor + +(defun natset-start (ns) + "Returns start element in NS." + (if (natset-empty-p ns) + (error "natset empty" ns)) + (car ns)) + +;;; primitive constructor + +(defun natset-empty () + "Returns a empty set. +{}" + ()) + +(defun natset-full () + "Returns a full set. +{i | 0 <= i}" + '(0)) + +(defun natset-single (&rest elts) + "Returns a set contains singleton elements. +{i | i in ELTS}" + (let ((ns (natset-empty))) + (while elts + (setq ns (natset-union ns (natset-seg (car elts) (car elts))) + elts (cdr elts))) + ns)) + +(defun natset-seg (start &optional end) + "Returns a set contains one segment. +{i | START <= i and i <= END} + +If END is nil, Return the set {i | START <= i}" + (if end + (list start (1+ end)) + (list start))) + +;;; complex constructor + +(defun natset-start-set (ns) + "Returns a set contains start boundaries for NS. +{i | NS does not contains i-1 and NS contains i}" + (let ((res ())) + (while ns + (setq res (cons (1+ (car ns)) (cons (car ns) res)) + ns (cddr ns))) + (nreverse res))) + +(defun natset-end-set (ns) + "Returns a set contains end boundaries for NS. +{i | NS contains i-1 and NS does not contains i}" + (let ((res ())) + (setq ns (cdr ns)) + (while ns + (setq res (cons (1+ (car ns)) (cons (car ns) res)) + ns (cddr ns))) + (nreverse res))) + +(defun natset-boundary-set (ns) + "Returns a set contains start and end boundaries for NS. +{i | NS contains i-1 xor NS does not contains i}" + (natset-union (natset-start-set ns) (natset-end-set ns))) + +(defun natset-minmax (ns) + "Returns a set contains a range from minimum to maximam of NS. +{i | There exists j, k in NS, j <= i <= k}" + (cond + ((null ns) ()) + ((natset-open-p ns) (list (car ns))) + (t + (list (car ns) (nth (1- (length ns)) ns))))) + +;;; set operation + +(defun natset-negate (ns) + "Returns negated set. +{i | 0 <= i and NS does not contains i}" + (if (and (consp ns) (= (car ns) 0)) + (cdr ns) + (cons 0 ns))) + +(defun natset-union (&rest nss) + "Returns unioned set. +{i | There exists ns in NSS s.t ns contains i}" + (let ((ns (natset-empty))) + (while nss + (setq ns (natset-union2 ns (car nss)) + nss (cdr nss))) + ns)) + +(defun natset-intersection (&rest nss) + "Returns intersectioned set. +{i | For all ns in NSS, ns contains i}" + (natset-negate (apply 'natset-union (mapcar 'natset-negate nss)))) + +(defun natset-sub (ns &rest nss) + "Returns subtracted set. +{i | NS contains i and for all ns in NSS, ns does not contains i}" + (setq ns (natset-intersection ns (natset-negate (apply 'natset-union nss))))) + +;;; enumeration + +(defun natset-enum (ns) + (if (natset-open-p ns) + (error "natset open" ns)) + (let ((res ()) i j) + (while ns + (setq i (car ns) + j (cadr ns) + ns (cddr ns)) + (while (< i j) + (setq res (cons i res) + i (1+ i)))) + (nreverse res))) + +;;; code generation + +(defun natset-take-seg (ns) + (cond + ((null ns) (error "NS empty" ns)) + ((null (cdr ns)) (cons ns ())) + (t (cons (list (car ns) (cadr ns)) (cddr ns))))) + +(defun natset-valid-filter (ns valid) + "Returns a filtered set R. +R includes intersection between VALID and NS. +R does not include intersecton between VALID and negated NS. +Element does not contained in VALID is unspecified." + (let* ((res (natset-intersection valid ns)) + (len (length res)) + (u-set (natset-negate valid)) + tmp1 tmp2 tmpl) + (while u-set + (setq tmp1 (natset-take-seg u-set)) + (setq tmp2 (natset-union (car tmp1) res) + tmpl (length tmp2)) + (if (or (< tmpl len) (and (= tmpl len) (equal 0 (car tmp2)))) + (setq res tmp2 + len (length tmp2))) + (setq u-set (cdr tmp1))) + res)) + +(defun natset-gen-pred-exp (ns var &optional valid) + "Returns a expression to test value of variable VAR is in NS or not. + +If VALID is not nil, the condition value of VAR is in VALID is assumed. +It is impossible to set VALID to empty set because empty set is represented as nil." + (if valid (setq ns (natset-valid-filter ns valid))) + (cond + ((null ns) nil) + ((= (car ns) 0) (natset-gen-pred-exp-internal (cdr ns) var nil 0)) + (t (natset-gen-pred-exp-internal ns var t 0)))) + +;;; internal primitive + +(defun natset-union2 (ns1 ns2) + (let (res start (end t)) + (while (and end (or (consp ns1) (consp ns2))) + (if (and (consp ns1) (or (null ns2) (<= (car ns1) (car ns2)))) + (setq start (car ns1) + end (cadr ns1) + ns1 (cddr ns1)) + (setq start (car ns2) + end (cadr ns2) + ns2 (cddr ns2))) + (while (and end + (or (and (consp ns1) (<= (car ns1) end)) + (and (consp ns2) (<= (car ns2) end)))) + (if (and (consp ns1) (<= (car ns1) end)) + (progn + (if (or (null (cadr ns1)) (< end (cadr ns1))) (setq end (cadr ns1))) + (setq ns1 (cddr ns1))) + (progn + (if (or (null (cadr ns2)) (< end (cadr ns2))) (setq end (cadr ns2))) + (setq ns2 (cddr ns2))))) + (setq res (cons start res)) + (if end (setq res (cons end res)))) + (nreverse res))) + +; n is greater or equal 2. +; returns one of 1 .. n-1 +; (In reality, returns greatest 2^i - 1) +(defun natset-divide (n) + (let ((l 2) tmp) + (while (< (setq tmp (lsh l 1)) n) + (setq l tmp)) + (1- l))) + +(defun natset-gen-pred-exp-internal (ns var bool base) + (cond + ((null ns) (not bool)) + ((null (cdr ns)) + (if (<= (car ns) base) + bool + (if bool `(<= ,(car ns) ,var) `(< ,var ,(car ns))))) + (t + (let* ((div (natset-divide (length ns))) + (l (append ns ())) + (g (nthcdr (1- div) l)) + (m (cadr g)) + ) + (setq g (prog1 (cddr g) (setcdr g ()))) + `(if (< ,var ,m) + ,(natset-gen-pred-exp-internal l var bool base) + ,(natset-gen-pred-exp-internal + g var (if (= (logand div 1) 1) bool (not bool)) m)))))) diff --git a/rx.el b/rx.el new file mode 100644 index 0000000..08c60d0 --- /dev/null +++ b/rx.el @@ -0,0 +1,394 @@ +;;; regular expression + +;;; core +;; rx ::= [] {} +;; | () {""} +;; | (* . rx) closure +;; | (| . rxs) alternative +;; | (rx . rx) concatination +;; | (cc c1 c2 c3 ...) character class (natset) +;;; sugar +;; | (+ . rx) positive closure +;; | "..." string +;; | c character +;; | (non-ascii) (cc 128) +;; | [ range ... ] +;; | [ ^ range ... ] +;;; internal +;; | pc primitive character class +;; | (act . int) action + +;; range ::= c +;; | "..." +;; | (c1 c2) [c1 c2] +;; | (c1 . c2) [c1 c2) +;; | (c) [c1 inf) +;; | non-ascii + +(require 'natset) +(require 'automata) +(provide 'rx) + +(defun rx-empty-p (rx) (equal rx [])) +(defun rx-null-p (rx) (equal rx ())) +(defun rx-act-p (rx) (and (consp rx) (eq (car rx) 'act))) +(defun rx-cc-p (rx) (and (consp rx) (eq (car rx) 'cc))) +(defalias 'rx-pc-p 'integerp) + +(defun rx-clo-p (rx) (and (consp rx) (eq (car rx) '*))) +(defun rx-alt-p (rx) (and (consp rx) (eq (car rx) '|))) +(defun rx-con-p (rx) (and (consp rx) (or (null (car rx)) (not (symbolp (car rx)))))) + +(defun rx-clo (rx) + (cond + ((rx-empty-p rx) ()) + ((rx-null-p rx) rx) + ((rx-act-p rx) rx) + ((rx-clo-p rx) rx) + (t (cons '* rx)))) +(defun rx-alt (rxs) + (cond + ((null rxs) []) + ((null (cdr rxs)) (car rxs)) + (t (cons '| rxs)))) +(defun rx-alt2 (r1 r2) + (cond + ((rx-empty-p r1) r2) + ((rx-empty-p r2) r1) + ((equal r1 r2) r1) + (t (list '| r1 r2)))) +(defun rx-con (r1 r2) + (cond + ((rx-empty-p r1) []) + ((rx-empty-p r2) []) + ((rx-null-p r1) r2) + ((rx-null-p r2) r1) + ((and (rx-act-p r1) (rx-act-p r2)) r2) + (t (cons r1 r2)))) +(defun rx-act (obj) (cons 'act obj)) +(defun rx-cc (cs) (cons 'cc cs)) + +;;; regular expression preprocessing + +(defun rx-range-to-ns (range) + (cond + ((char-or-char-int-p range) + (natset-single (char-int range))) + ((stringp range) + (let ((ns (natset-empty)) (chars (string-to-int-list range))) + (while chars + (setq ns (natset-union ns (natset-single (car chars))) + chars (cdr chars))) + ns)) + ((eq range 'non-ascii) + (natset-seg 128)) + ((and (consp range) + (null (cdr range)) + (char-or-char-int-p (car range))) + (natset-seg (car range))) + ((and (consp range) + (consp (cdr range)) + (null (cddr range)) + (char-or-char-int-p (car range)) + (char-or-char-int-p (cadr range))) + (natset-seg (char-int (car range)) (char-int (cadr range)))) + ((and (consp range) + (char-or-char-int-p (car range)) + (char-or-char-int-p (cdr range))) + (natset-seg (char-int (car range)) (1- (char-int (cdr range))))) + (t (error "not range %s" range)))) + +(defun rx-vcc-to-rx (vcc) + (let ((l (append vcc ())) neg ns) + (if (eq (car l) '^) + (setq l (cdr l) + neg t)) + (setq l (mapcar 'rx-range-to-ns l)) + (setq ns (natset-empty)) + (while l + (setq ns (natset-union ns (car l)) + l (cdr l))) + (if neg (setq ns (natset-negate ns))) + (if (natset-empty-p ns) + [] + (rx-cc ns)))) + +(defun rx-desugar (rx) + (cond + ((stringp rx) (rx-desugar (string-to-int-list rx))) + ((vectorp rx) (rx-vcc-to-rx rx)) + ((char-or-char-int-p rx) (rx-cc (natset-single (char-int rx)))) + ((and (consp rx) (eq (car rx) '+)) (let ((r (rx-desugar (cdr rx)))) (rx-con r (rx-clo r)))) + ((and (consp rx) (eq (car rx) 'non-ascii)) (rx-cc (natset-seg 128))) + ((and (consp rx) (eq (car rx) 'any)) (rx-cc (natset-full))) + ((rx-empty-p rx) rx) + ((rx-null-p rx) rx) + ((rx-act-p rx) rx) + ((rx-cc-p rx) rx) + ((rx-clo-p rx) (rx-clo (rx-desugar (cdr rx)))) + ((rx-alt-p rx) (rx-alt (mapcar 'rx-desugar (cdr rx)))) + ((rx-con-p rx) (rx-con (rx-desugar (car rx)) (rx-desugar (cdr rx)))) + (t (error "not rx %s" rx)))) + +(defun rx-collect-cc (rx &optional cs) + (cond + ((rx-empty-p rx) cs) + ((rx-null-p rx) cs) + ((rx-act-p rx) cs) + ((rx-cc-p rx) (append (cdr rx) cs)) + ((rx-clo-p rx) (rx-collect-cc (cdr rx) cs)) + ((rx-alt-p rx) + (setq rx (cdr rx)) + (while (consp rx) + (setq cs (rx-collect-cc (car rx) cs) + rx (cdr rx))) + cs) + ((rx-con-p rx) (rx-collect-cc (car rx) (rx-collect-cc (cdr rx) cs))) + (t (error "not rx %s" rx)))) + +(defun rx-cc-to-pc (rx cs) + (cond + ((rx-empty-p rx) rx) + ((rx-null-p rx) rx) + ((rx-act-p rx) rx) + ((rx-cc-p rx) + (setq rx (cdr rx)) + (let (res) + (while (and (consp rx) (consp (cdr rx))) + (let ((start (car rx)) (end (cadr rx))) + (setq res (rx-filter (lambda (c) (and (<= start c) (< c end))) cs res) + rx (cddr rx)))) + (if (consp rx) + (let ((start (car rx))) + (setq res (rx-filter (lambda (c) (<= start c)) cs res)))) + (rx-alt (rx-sort-int res)))) + ((rx-clo-p rx) (rx-clo (rx-cc-to-pc (cdr rx) cs))) + ((rx-alt-p rx) (rx-alt (mapcar (lambda (r) (rx-cc-to-pc r cs)) (cdr rx)))) + ((rx-con-p rx) (rx-con (rx-cc-to-pc (car rx) cs) (rx-cc-to-pc (cdr rx) cs))) + (t (error "not rx %s" rx)))) + +(defun rx-categolize-char (rx) + (let ((cs (rx-sort-int (rx-collect-cc rx)))) + (cons + (rx-cc-to-pc rx cs) + cs))) + +;;; simplification + +(defun rx-nullable-p (rx) + (cond + ((rx-empty-p rx) nil) + ((rx-null-p rx) t) + ((rx-act-p rx) t) + ((rx-pc-p rx) nil) + ((rx-clo-p rx) t) + ((rx-alt-p rx) + (setq rx (cdr rx)) + (while (and (consp rx) (not (rx-nullable-p (car rx)))) + (setq rx (cdr rx))) + (consp rx)) + ((rx-con-p rx) + (and (rx-nullable-p (car rx)) (rx-nullable-p (cdr rx)))) + (t (error "not rx %s" rx)))) + +(defun rx-simplify (rx) + (cond + ((rx-empty-p rx) rx) + ((rx-null-p rx) rx) + ((rx-act-p rx) rx) + ((rx-pc-p rx) rx) + ((rx-clo-p rx) + (rx-clo (rx-simplify (cdr rx)))) + ((rx-alt-p rx) + (let ((in (cdr rx)) (out ()) + already-simplified-list already-simplified) + (while (consp in) + (setq rx (car in) + in (cdr in) + already-simplified (car already-simplified-list) + already-simplified-list (cdr already-simplified-list)) + (if (rx-alt-p rx) + (setq in (append (cdr rx) in)) + (progn + (setq rx (if already-simplified rx (rx-simplify rx))) + (cond + ((rx-empty-p rx)) ; [] is identity element for alternation. + ((rx-alt-p rx) + (setq in (append (cdr rx) in) + already-simplified-list (append (make-list (length (cdr rx)) nil) already-simplified-list))) + ((not (member rx out)) + (setq out (cons rx out))))))) + (rx-alt (rx-sort-rx (reverse out))))) + ((rx-con-p rx) + (catch 'return + (let ((in (list (car rx) (cdr rx))) (out ()) + already-simplified-list already-simplified) + (while (consp in) + (setq rx (car in) + in (cdr in) + already-simplified (car already-simplified-list) + already-simplified-list (cdr already-simplified-list)) + (if (rx-con-p rx) + (setq in (rx-cons* (car rx) (cdr rx) in)) + (progn + (setq rx (if already-simplified rx (rx-simplify rx))) + (cond + ((rx-empty-p rx) ; [] is zero element for concatination. + (throw 'return [])) + ((rx-null-p rx)) ; () is identity element for concatination. + ((rx-con-p rx) + (setq in (rx-cons* (car rx) (cdr rx) in)) + already-simplified-list (rx-cons* t t already-simplified-list)) + (t + (setq out (cons rx out))))))) + (if (= (length out) 1) + (car out) + (nreverse out))))) + (t (error "not rx %s" rx)))) + +;;; head property + +(defun rx-head-pcs (rx &optional res) + (cond + ((rx-empty-p rx) res) + ((rx-null-p rx) res) + ((rx-act-p rx) res) + ((rx-pc-p rx) (if (member rx res) res (cons rx res))) + ((rx-clo-p rx) (rx-head-pcs (cdr rx) res)) + ((rx-alt-p rx) + (setq rx (cdr rx)) + (while (consp rx) + (setq res (rx-head-pcs (car rx) res) + rx (cdr rx))) + res) + ((rx-con-p rx) + (setq res (rx-head-pcs (car rx) res)) + (if (rx-nullable-p (car rx)) + (setq res (rx-head-pcs (cdr rx) res))) + res) + (t (error "not rx %s" rx)))) + +(defun rx-head-act (rx &optional res) + (cond + ((rx-empty-p rx) res) + ((rx-null-p rx) res) + ((rx-act-p rx) (rx-better-act rx res)) + ((rx-pc-p rx) res) + ((rx-clo-p rx) (rx-head-act (cdr rx) res)) + ((rx-alt-p rx) + (setq rx (cdr rx)) + (while (consp rx) + (setq res (rx-head-act (car rx) res) + rx (cdr rx))) + res) + ((rx-con-p rx) + (setq res (rx-head-act (car rx) res)) + (if (rx-nullable-p (car rx)) + (setq res (rx-head-act (cdr rx) res))) + res) + (t (error "not rx %s" rx)))) + +;;; stepping + +(defun rx-step-internal (rx pc) + (cond + ((rx-empty-p rx) []) + ((rx-null-p rx) []) + ((rx-act-p rx) []) + ((rx-pc-p rx) (if (= rx pc) () [])) + ((rx-clo-p rx) (rx-con (rx-step-internal (cdr rx) pc) rx)) + ((rx-alt-p rx) (rx-alt (mapcar (lambda (r) (rx-step-internal r pc)) (cdr rx)))) + ((rx-con-p rx) + (if (rx-nullable-p (car rx)) + (rx-alt2 + (rx-con (rx-step-internal (car rx) pc) (cdr rx)) + (rx-step-internal (cdr rx) pc)) + (rx-con (rx-step-internal (car rx) pc) (cdr rx)))) + (t (error "not rx %s" rx)))) + +(defun rx-step (rx &rest pcs) + (while (consp pcs) + (setq rx (rx-simplify (rx-step-internal rx (car pcs))) + pcs (cdr pcs))) + rx) + +;;; utilities + +(defun rx-better-act (a1 a2) + (cond + ((null a2) a1) + ((< (cdr a1) (cdr a2)) a1) + (t a2))) + +(defun rx-cons* (elt &rest lst) + (if (null lst) + elt + (cons elt (apply 'rx-cons* (car lst) (cdr lst))))) + +(defun rx-filter (fun lst &optional rest) + (if (null lst) + rest + (if (funcall fun (car lst)) + (cons (car lst) (rx-filter fun (cdr lst) rest)) + (rx-filter fun (cdr lst) rest)))) + +(defun rx-cmp-index (rx) + (cond + ((rx-null-p rx) (list 0)) + ((rx-act-p rx) (list 1 (cdr rx))) + ((rx-empty-p rx) (list 2)) + ((rx-clo-p rx) (list 3 (cdr rx))) + ((rx-alt-p rx) (cons 4 (cdr rx))) + ((rx-con-p rx) (list 5 (car rx) (cdr rx))) + ((rx-pc-p rx) (list 6 rx)) + (t (error "not rx %s" rx)))) + +(defun rx-cmp-int (i1 i2) + (cond + ((< i1 i2) -1) + ((> i1 i2) 1) + (t 0))) + +(defun rx-cmp-rx (r1 r2) + (let ((i1 (rx-cmp-index r1)) (i2 (rx-cmp-index r2))) + (cond + ((< (car i1) (car i2)) -1) + ((> (car i1) (car i2)) 1) + (t (setq i1 (cdr i1) + i2 (cdr i2)) + (catch 'result + (while (and (consp i1) (consp i2)) + (let ((r (if (and (integerp (car i1)) (integerp (car i2))) + (rx-cmp-int (car i1) (car i2)) + (rx-cmp-rx (car i1) (car i2))))) + (if (not (zerop r)) + (throw 'result r) + (setq i1 (cdr i1) + i2 (cdr i2))))) + (if (null i1) (if (null i2) 0 -1) 1)))))) + +(defun rx-sort-rx (l &optional res) + (if (null l) + res + (let ((e (car l)) lt gt cmp) + (setq l (cdr l)) + (while (consp l) + (setq cmp (rx-cmp-rx (car l) e)) + (cond + ((< cmp 0) (setq lt (cons (car l) lt))) + ((< 0 cmp) (setq gt (cons (car l) gt)))) + (setq l (cdr l))) + (rx-sort-rx lt (cons e (rx-sort-rx gt res)))))) + +(defun rx-sort-int (l &optional res) + (if (null l) + res + (let ((e (car l)) lt gt) + (setq l (cdr l)) + (while (consp l) + (cond + ((< (car l) e) (setq lt (cons (car l) lt))) + ((< e (car l)) (setq gt (cons (car l) gt)))) + (setq l (cdr l))) + (rx-sort-int lt (cons e (rx-sort-int gt res)))))) + -- 1.7.10.4