Switch to comparse for UTF8 validation.
parent
baf570ab65
commit
d194289740
@ -0,0 +1,76 @@
|
||||
(import chicken scheme)
|
||||
(use srfi-4 srfi-13 srfi-14 comparse)
|
||||
|
||||
|
||||
(define (ucs-range->char-set/inclusive lower upper)
|
||||
(ucs-range->char-set lower (add1 upper)))
|
||||
|
||||
(define utf8-tail
|
||||
(in (ucs-range->char-set/inclusive #x80 #xBF)))
|
||||
|
||||
(define utf8-1
|
||||
(in (ucs-range->char-set/inclusive #x00 #x7F)))
|
||||
|
||||
(define utf8-2
|
||||
(sequence
|
||||
(in (ucs-range->char-set/inclusive #xC2 #xDF))
|
||||
utf8-tail))
|
||||
|
||||
(define utf8-3
|
||||
(any-of
|
||||
(sequence
|
||||
(is #\xE0)
|
||||
(in (ucs-range->char-set/inclusive #xA0 #xBF))
|
||||
utf8-tail)
|
||||
(sequence
|
||||
(in (ucs-range->char-set/inclusive #xE1 #xEC))
|
||||
(repeated utf8-tail 2))
|
||||
(sequence
|
||||
(is #\xED)
|
||||
(in (ucs-range->char-set/inclusive #x80 #x9F))
|
||||
utf8-tail)
|
||||
(sequence
|
||||
(in (ucs-range->char-set/inclusive #xEE #xEF))
|
||||
(repeated utf8-tail 2))))
|
||||
|
||||
(define utf8-4
|
||||
(any-of
|
||||
(sequence
|
||||
(is #\xF0)
|
||||
(in (ucs-range->char-set/inclusive #x90 #xBF))
|
||||
(repeated utf8-tail 2))
|
||||
(sequence
|
||||
(in (ucs-range->char-set/inclusive #xF1 #xF3))
|
||||
(repeated utf8-tail 3))
|
||||
(sequence
|
||||
(is #\xF4)
|
||||
(in (ucs-range->char-set/inclusive #x80 #x8F))
|
||||
(repeated utf8-tail 2))))
|
||||
|
||||
(define utf8-char
|
||||
(any-of
|
||||
utf8-1
|
||||
utf8-2
|
||||
utf8-3
|
||||
utf8-4))
|
||||
|
||||
(define utf8-string
|
||||
(followed-by (zero-or-more utf8-char) end-of-input))
|
||||
|
||||
;; (parse utf8-string (->parser-input "Hello-µ@ßöäüàá-UTF-8!!"))
|
||||
;; (parse utf8-char (->parser-input #\a))
|
||||
|
||||
;; (define (valid-utf8? s)
|
||||
;; (let ((len (string-length s)))
|
||||
;; (let loop ((i 0))
|
||||
;; (if (= i len)
|
||||
;; #t
|
||||
;; (let ((r (parse utf8-char (->parser-input (->string (string-ref s i))))))
|
||||
;; (if r
|
||||
;; (loop (+ i (length r)))
|
||||
;; (string-ref s i)))))))
|
||||
;; (valid-utf8? "Hello-µ@ßöäüàá-UTF-8!!")
|
||||
;; (valid-utf8? "Hello")
|
||||
;; (parse utf8-char (->parser-input (->string #\H)))
|
||||
|
||||
;; #\xC0
|
Loading…
Reference in New Issue