You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
75 lines
1.9 KiB
Scheme
75 lines
1.9 KiB
Scheme
(define (ucs-range->char-set/inclusive lower upper)
|
|
(ucs-range->char-set lower (add1 upper)))
|
|
|
|
(define utf8-tail
|
|
(in (ucs-range->char-set/inclusive #x80 #xBF)))
|
|
|
|
(define utf8-1
|
|
(satisfies (lambda (c) (or (< (char->integer c) 128)
|
|
(and (> (char->integer c) 128)
|
|
(< (char->integer c) 191))))))
|
|
|
|
(define utf8-2
|
|
(sequence
|
|
(in (ucs-range->char-set/inclusive #xC2 #xDF))
|
|
utf8-tail))
|
|
|
|
(define utf8-3
|
|
(any-of
|
|
(sequence
|
|
(is #\xE0)
|
|
(in (ucs-range->char-set/inclusive #xA0 #xBF))
|
|
utf8-tail)
|
|
(sequence
|
|
(in (ucs-range->char-set/inclusive #xE1 #xEC))
|
|
(repeated utf8-tail 2))
|
|
(sequence
|
|
(is #\xED)
|
|
(in (ucs-range->char-set/inclusive #x80 #x9F))
|
|
utf8-tail)
|
|
(sequence
|
|
(in (ucs-range->char-set/inclusive #xEE #xEF))
|
|
(repeated utf8-tail 2))))
|
|
|
|
(define utf8-4
|
|
(any-of
|
|
(sequence
|
|
(is #\xF0)
|
|
(in (ucs-range->char-set/inclusive #x90 #xBF))
|
|
(repeated utf8-tail 2))
|
|
(sequence
|
|
(in (ucs-range->char-set/inclusive #xF1 #xF3))
|
|
(repeated utf8-tail 3))
|
|
(sequence
|
|
(is #\xF4)
|
|
(in (ucs-range->char-set/inclusive #x80 #x8F))
|
|
(repeated utf8-tail 2))))
|
|
|
|
(define utf8-char
|
|
(any-of
|
|
utf8-1
|
|
utf8-2
|
|
utf8-3
|
|
utf8-4))
|
|
|
|
(define utf8-string
|
|
(followed-by (zero-or-more utf8-char) end-of-input))
|
|
|
|
;; (parse utf8-string (->parser-input "Hello-µ@ßöäüàá-UTF-8!!"))
|
|
;; (parse utf8-char (->parser-input #\a))
|
|
|
|
;; (define (valid-utf8? s)
|
|
;; (let ((len (string-length s)))
|
|
;; (let loop ((i 0))
|
|
;; (if (= i len)
|
|
;; #t
|
|
;; (let ((r (parse utf8-char (->parser-input (->string (string-ref s i))))))
|
|
;; (if r
|
|
;; (loop (+ i (length r)))
|
|
;; (string-ref s i)))))))
|
|
;; (valid-utf8? "Hello-µ@ßöäüàá-UTF-8!!")
|
|
;; (valid-utf8? "Hello")
|
|
;; (parse utf8-char (->parser-input (->string #\H)))
|
|
|
|
;; #\xC0
|