;;separate-sticked-comment ; violate the policy preserving absence of space
;;separate-sticked-special ; violate the policy preserving absence of space
;;zero-characters-encoded-word-hack ; violate common sense (very tricky)
-;;embedded-encoded-word ; violate character sequence semantics
+;;embedded-encoded-word ; violate character sequence semantics
;;separate-sticked-tokens-for-fold ; violate the policy preserving absence of space
;;encode-long-ascii-string-for-fold ; violate the policy preserving US-ASCII string
-;;divide-atom-for-fold ; violate tne policy preserving absence of space
+;;divide-atom-for-fold ; violate tne policy preserving absence of space
;;; test driver
(decode
"From: Nathaniel Borenstein <nsb@thumper.bellcore.com>\r
- (=?iso-8859-8?b?7eXs+SDv4SDp7Oj08A==?=)"
-"From: Nathaniel Borenstein <nsb@thumper.bellcore.com> (\e.H\eNm\eNe\eNl\eNy \eNo\eNa \eNi\eNl\eNh\eNt\eNp)")\e*B
+\t(=?iso-8859-8?b?7eXs+SDv4SDp7Oj08A==?=)"
+"From: Nathaniel Borenstein <nsb@thumper.bellcore.com>\t(\e.H\eNm\eNe\eNl\eNy \eNo\eNa \eNi\eNl\eNh\eNt\eNp)")\e*B
(decode
"From: foo@bar.baz (=?ISO-8859-1?Q?a?=)"
(decode
"From: foo@bar.baz (=?ISO-8859-1?Q?a?=\r
- =?ISO-8859-1?Q?b?=)"
+\t=?ISO-8859-1?Q?b?=)"
"From: foo@bar.baz (ab)")
(decode
(decode
"Subject: (=?ISO-8859-1?Q?a?=\r
- =?ISO-8859-1?Q?b?=)"
-"Subject: (=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)")
+\t=?ISO-8859-1?Q?b?=)"
+"Subject: (=?ISO-8859-1?Q?a?=\t=?ISO-8859-1?Q?b?=)")
(decode
"Subject: (=?ISO-8859-1?Q?a_b?=)"
(decode
"From: =?ISO-2022-JP?B?GyRCJCIbKEI=?=\r
- =?ISO-2022-JP?B?GyRCJCQbKEI=?=\r
+\t=?ISO-2022-JP?B?GyRCJCQbKEI=?=\r
<akr@jaist.ac.jp>"
"From: \e$B$"$$\e(B <akr@jaist.ac.jp>")
(decode
"Subject: =?Unknown-Charset?B?gqA=?=\r
- =?Unknown-Charset?B?gqA=?="
+\t=?Unknown-Charset?B?gqA=?="
"Subject: =?+Unknown-Charset?B?gqA=?==?+Unknown-Charset?B?gqA=?="
'embedded-encoded-word)
"From: akr@jaist.ac.jp (\r=?US-ASCII?Q?=0A?= )")
(decode
-"From: akr@jaist.ac.jp (\r=?US-ASCII?Q?=0A?= )"
-"From: akr@jaist.ac.jp (\r=?US-ASCII?Q?=0A?= )")
+"From: akr@jaist.ac.jp (\r=?US-ASCII?Q?=0A?=\t)"
+"From: akr@jaist.ac.jp (\r=?US-ASCII?Q?=0A?=\t)")
(decode
"From: akr@jaist.ac.jp (\\\r\\\n)"
(decode
"From: akr@jaist.ac.jp (\\\r=?US-ASCII?Q?=0A=09?=)"
-"From: akr@jaist.ac.jp (\\ )")
+"From: akr@jaist.ac.jp (\\\t)")
(decode
"From: akr@jaist.ac.jp (\\\r=?US-ASCII?Q?=0A?= )"
"From: akr@jaist.ac.jp (\\ )")
(decode
-"From: akr@jaist.ac.jp (\\\r=?US-ASCII?Q?=0A?= )"
-"From: akr@jaist.ac.jp (\\ )")
+"From: akr@jaist.ac.jp (\\\r=?US-ASCII?Q?=0A?=\t)"
+"From: akr@jaist.ac.jp (\\\t)")
(decode
"From: akr@jaist.ac.jp (\\\r=?US-ASCII?Q?=0A?=\r
(put 'lex-scan-multibyte 'lisp-indent-function 3)
(put 'lex-scan-unibyte 'lisp-indent-function 3)
+;;;
+
+(eval-and-compile
+(defvar lex-use-ccl (fboundp 'ccl-execute-on-string))
+(when lex-use-ccl
+ (require 'ccl))
+)
+
+;;; user interface macro
+
+;;; multibyte
+
+(defvar lex-scan-multibyte-str-var (make-symbol "str"))
+(defvar lex-scan-multibyte-ptr-var (make-symbol "ptr"))
+(defvar lex-scan-multibyte-end-var (make-symbol "end"))
+(defvar lex-scan-multibyte-mch-var (make-symbol "mch"))
+
+(defmacro lex-scan-multibyte-read (pc)
+ `(if (< ,lex-scan-multibyte-ptr-var ,lex-scan-multibyte-end-var)
+ (setq ,pc (sref ,lex-scan-multibyte-str-var ,lex-scan-multibyte-ptr-var)
+ ,lex-scan-multibyte-ptr-var (char-next-index ,pc ,lex-scan-multibyte-ptr-var)
+ ,pc (char-int ,pc))
+ (lex-fail)))
+
+(defmacro lex-scan-multibyte-save ()
+ `(setq ,lex-scan-multibyte-mch-var ,lex-scan-multibyte-ptr-var))
+
+(defmacro lex-scan-multibyte (str start end &rest clauses)
+ (if (not start) (setq start 0))
+ (if (not end) (setq end `(length ,lex-scan-multibyte-str-var)))
+ (let ((id 1) (rx ()) (acts ()) tmp code
+ (restore-code (if (symbolp start) `(setq ,start ,lex-scan-multibyte-mch-var))))
+ (while (consp clauses)
+ (setq rx (cons (rx-con (caar clauses) (rx-act id)) rx)
+ acts (cons (cons id (cons restore-code (cdar clauses))) acts)
+ id (1+ id)
+ clauses (cdr clauses)))
+ (setq rx (rx-alt rx)
+ tmp (rx-categolize-char (rx-desugar rx)))
+ `(let* ((,lex-scan-multibyte-str-var ,str)
+ (,lex-scan-multibyte-ptr-var ,start)
+ (,lex-scan-multibyte-end-var ,end)
+ ,lex-scan-multibyte-mch-var)
+ ,(lex-gen-machine (lex-automata (car tmp)) (cdr tmp) acts 'lex-scan-multibyte-read 'lex-scan-multibyte-save))))
+
+;;; unibyte
+
+(defvar lex-scan-unibyte-str-var (make-symbol "str"))
+(defvar lex-scan-unibyte-ptr-var (make-symbol "ptr"))
+(defvar lex-scan-unibyte-end-var (make-symbol "end"))
+(defvar lex-scan-unibyte-mch-var (make-symbol "mch"))
+
+(defmacro lex-scan-unibyte-read (pc)
+ `(if (< ,lex-scan-unibyte-ptr-var ,lex-scan-unibyte-end-var)
+ (setq ,pc (aref ,lex-scan-unibyte-str-var ,lex-scan-unibyte-ptr-var)
+ ,lex-scan-unibyte-ptr-var (1+ ,lex-scan-unibyte-ptr-var)
+ ,pc (char-int ,pc))
+ (lex-fail)))
+
+(defmacro lex-scan-unibyte-save ()
+ `(setq ,lex-scan-unibyte-mch-var ,lex-scan-unibyte-ptr-var))
+
+(defmacro lex-scan-unibyte (str start end &rest clauses)
+ (if (not start) (setq start 0))
+ (if (not end) (setq end `(length ,lex-scan-unibyte-str-var)))
+ (let ((id 1) (rx ()) (acts ()) tmp code
+ (restore-code (if (symbolp start) `(setq ,start ,lex-scan-unibyte-mch-var))))
+ (while (consp clauses)
+ (setq rx (cons (rx-con (caar clauses) (rx-act id)) rx)
+ acts (cons (cons id (cons restore-code (cdar clauses))) acts)
+ id (1+ id)
+ clauses (cdr clauses)))
+ (setq rx (rx-alt rx)
+ tmp (rx-categolize-char (rx-desugar rx)))
+ `(let* ((,lex-scan-unibyte-str-var ,str)
+ (,lex-scan-unibyte-ptr-var ,start)
+ (,lex-scan-unibyte-end-var ,end)
+ ,lex-scan-unibyte-mch-var)
+ ,(lex-gen-machine (lex-automata (car tmp)) (cdr tmp) acts 'lex-scan-unibyte-read 'lex-scan-unibyte-save))))
+
;;; automata generation
(defun lex-automata (rx)
(defun lex-gen-machine (states cs acts read-macro save-macro)
`(let (,lex-pc-var ,lex-act-var)
- (catch ',lex-escape-tag
- (automata
- ,lex-pc-var 0
- ,@(mapcar
- (lambda (s) (lex-gen-state s cs read-macro save-macro))
- states)))
- (automata-branch
- ,lex-act-var ,(apply 'natset-single (mapcar 'car acts)) automata-never-fail
+ ,(if (and lex-use-ccl
+ (eq read-macro 'lex-scan-unibyte-read)
+ (eq save-macro 'lex-scan-unibyte-save))
+ (lex-gen-ccl-unibyte-automata states cs)
+ (lex-gen-automata states cs read-macro save-macro))
+ ,(lex-gen-action acts)))
+
+(defun lex-gen-automata (states cs read-macro save-macro)
+ `(catch ',lex-escape-tag
+ (automata
+ ,lex-pc-var 0
,@(mapcar
- (lambda (act) `(,(natset-single (car act)) nil ,@(cdr act)))
- acts))))
+ (lambda (s) (lex-gen-state s cs read-macro save-macro))
+ states))))
(defun lex-gen-state (s cs read-macro save-macro)
(let ((id (nth 0 s))
,(cadr tr)))
trans))))
-;;; internal macros
+(defun lex-gen-action (acts)
+ `(automata-branch
+ ,lex-act-var ,(apply 'natset-single (mapcar 'car acts)) automata-never-fail
+ ,@(mapcar
+ (lambda (act) `(,(natset-single (car act)) nil ,@(cdr act)))
+ acts)))
-(defmacro lex-match (id)
- `(setq ,lex-act-var ',id))
-(defmacro lex-fail ()
- `(throw ',lex-escape-tag nil))
-
-;;; user interface macro
-
-;;; multibyte
-
-(defvar lex-scan-multibyte-str-var (make-symbol "str"))
-(defvar lex-scan-multibyte-ptr-var (make-symbol "ptr"))
-(defvar lex-scan-multibyte-end-var (make-symbol "end"))
-(defvar lex-scan-multibyte-mch-var (make-symbol "mch"))
-
-(defmacro lex-scan-multibyte-read (pc)
- `(if (< ,lex-scan-multibyte-ptr-var ,lex-scan-multibyte-end-var)
- (setq ,pc (sref ,lex-scan-multibyte-str-var ,lex-scan-multibyte-ptr-var)
- ,lex-scan-multibyte-ptr-var (char-next-index ,pc ,lex-scan-multibyte-ptr-var)
- ,pc (char-int ,pc))
- (lex-fail)))
-
-(defmacro lex-scan-multibyte-save ()
- `(setq ,lex-scan-multibyte-mch-var ,lex-scan-multibyte-ptr-var))
-
-(defmacro lex-scan-multibyte (str start end &rest clauses)
- (if (not start) (setq start 0))
- (if (not end) (setq end `(length ,lex-scan-multibyte-str-var)))
- (let ((id 1) (rx ()) (acts ()) tmp code
- (restore-code (if (symbolp start) `(setq ,start ,lex-scan-multibyte-mch-var))))
- (while (consp clauses)
- (setq rx (cons (rx-con (caar clauses) (rx-act id)) rx)
- acts (cons (cons id (cons restore-code (cdar clauses))) acts)
- id (1+ id)
- clauses (cdr clauses)))
- (setq rx (rx-alt rx)
- tmp (rx-categolize-char (rx-desugar rx)))
- `(let* ((,lex-scan-multibyte-str-var ,str)
- (,lex-scan-multibyte-ptr-var ,start)
- (,lex-scan-multibyte-end-var ,end)
- ,lex-scan-multibyte-mch-var)
- ,(lex-gen-machine (lex-automata (car tmp)) (cdr tmp) acts 'lex-scan-multibyte-read 'lex-scan-multibyte-save))))
+;;; CCL version automata generation
-;;; unibyte
+(defun lex-gen-ccl-unibyte-automata (states cs)
+ ;; read-macro is lex-scan-unibyte-read
+ ;; save-macro is lex-scan-unibyte-save
+ `(let ((status [nil nil nil nil nil nil nil nil nil]))
+ (aset status 0 nil) ; r0: pc
+ (aset status 1 0) ; r1: state
+ (aset status 2 ,lex-scan-unibyte-ptr-var) ; r2: ptr
+ (aset status 3 ,lex-scan-unibyte-ptr-var) ; r3: start
+ (aset status 4 ,lex-scan-unibyte-end-var) ; r4: end
+ (aset status 5 nil) ; r5: mch
+ (aset status 6 0) ; r6: act
+ (aset status 7 nil) ; r7
+ (aset status 8 nil) ; ic
+ (ccl-execute-on-string
+ (eval-when-compile
+ (ccl-compile
+ ',(lex-gen-ccl-unibyte-automata-program states cs)))
+ status
+ ,lex-scan-unibyte-str-var)
+ (setq ,lex-scan-unibyte-ptr-var (aref status 2))
+ (when (< 0 (aref status 6))
+ (setq ,lex-act-var (aref status 6)
+ ,lex-scan-unibyte-mch-var (aref status 5)))))
-(defvar lex-scan-unibyte-str-var (make-symbol "str"))
-(defvar lex-scan-unibyte-ptr-var (make-symbol "ptr"))
-(defvar lex-scan-unibyte-end-var (make-symbol "end"))
-(defvar lex-scan-unibyte-mch-var (make-symbol "mch"))
+(defun lex-gen-ccl-unibyte-automata-program (states cs)
+ `(0
+ ((loop
+ (if (r3 > 0)
+ ((r3 -= 1)
+ (read r0)
+ (repeat))
+ (break)))
+ (loop
+ (branch r1
+ ,@(mapcar
+ (lambda (s) (lex-gen-ccl-unibyte-automata-state
+ (nth 0 s) (cdr (nth 1 s)) (nth 2 s)
+ cs))
+ (sort states
+ (lambda (a b) (< (car a) (car b))))))))))
-(defmacro lex-scan-unibyte-read (pc)
- `(if (< ,lex-scan-unibyte-ptr-var ,lex-scan-unibyte-end-var)
- (setq ,pc (aref ,lex-scan-unibyte-str-var ,lex-scan-unibyte-ptr-var)
- ,lex-scan-unibyte-ptr-var (1+ ,lex-scan-unibyte-ptr-var)
- ,pc (char-int ,pc))
- (lex-fail)))
+(defun lex-gen-ccl-unibyte-automata-state (id act trans cs)
+ `(,@(when act
+ `((r5 = r2)
+ (r6 = ,act)))
+ ,@(if (consp trans)
+ `((if (r4 <= r2)
+ (end)
+ ((read r0)
+ (r2 += 1)
+ ,(apply
+ 'natset-gen-ccl-branch
+ 'r0
+ '(end)
+ (mapcar
+ (lambda (tr) (cons
+ (let ((l (member (car tr) cs)))
+ (if (null (cdr l))
+ (natset-seg (car l))
+ (natset-seg (car l) (1- (cadr l)))))
+ `(r1 = ,(cadr tr))))
+ trans))
+ (repeat))))
+ '((end)))))
-(defmacro lex-scan-unibyte-save ()
- `(setq ,lex-scan-unibyte-mch-var ,lex-scan-unibyte-ptr-var))
+;;; internal macros
-(defmacro lex-scan-unibyte (str start end &rest clauses)
- (if (not start) (setq start 0))
- (if (not end) (setq end `(length ,lex-scan-unibyte-str-var)))
- (let ((id 1) (rx ()) (acts ()) tmp code
- (restore-code (if (symbolp start) `(setq ,start ,lex-scan-unibyte-mch-var))))
- (while (consp clauses)
- (setq rx (cons (rx-con (caar clauses) (rx-act id)) rx)
- acts (cons (cons id (cons restore-code (cdar clauses))) acts)
- id (1+ id)
- clauses (cdr clauses)))
- (setq rx (rx-alt rx)
- tmp (rx-categolize-char (rx-desugar rx)))
- `(let* ((,lex-scan-unibyte-str-var ,str)
- (,lex-scan-unibyte-ptr-var ,start)
- (,lex-scan-unibyte-end-var ,end)
- ,lex-scan-unibyte-mch-var)
- ,(lex-gen-machine (lex-automata (car tmp)) (cdr tmp) acts 'lex-scan-unibyte-read 'lex-scan-unibyte-save))))
+(defmacro lex-match (id)
+ `(setq ,lex-act-var ',id))
+(defmacro lex-fail ()
+ `(throw ',lex-escape-tag nil))
;;; utilities
"aaa" 0 3
(?a 'a))
+(let* ((str "abcdef\ndeefx\r\n jfdks\r")
+ (p 15))
+ (cons
+ (lex-scan-unibyte str p nil
+ (()
+ 'error)
+ (((* [^ "\r\n"])
+ (* (+ ?\r) [^ "\r\n"] (* [^ "\r"]))
+ (* ?\r)
+ (?\r ?\n [" \t"]))
+ 'line-fold)
+ (((* [^ "\r\n"])
+ (* (+ ?\r) [^ "\r\n"] (* [^ "\r"]))
+ (* ?\r)
+ (?\r ?\n))
+ 'line-crlf)
+ (((* [^ "\r\n"])
+ (* (+ ?\r) [^ "\r\n"] (* [^ "\r"]))
+ (* ?\r))
+ 'line))
+ p))
+
+(ew-crlf-line-convert "abcdef\ndeefx\r\n jfdks\r"
+ (lambda (a) (format "[L:%s]" a))
+ (lambda (a) (format "[F:%s]" a))
+ (lambda (a) (format "[N:%s]" a)))
+
+
)
+