You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
websockets/utf8-grammar.scm

75 lines
1.9 KiB
Scheme

(define (ucs-range->char-set/inclusive lower upper)
(ucs-range->char-set lower (add1 upper)))
(define utf8-tail
(in (ucs-range->char-set/inclusive #x80 #xBF)))
(define utf8-1
(satisfies (lambda (c) (or (< (char->integer c) 128)
(and (> (char->integer c) 128)
(< (char->integer c) 191))))))
(define utf8-2
(sequence
(in (ucs-range->char-set/inclusive #xC2 #xDF))
utf8-tail))
(define utf8-3
(any-of
(sequence
(is #\xE0)
(in (ucs-range->char-set/inclusive #xA0 #xBF))
utf8-tail)
(sequence
(in (ucs-range->char-set/inclusive #xE1 #xEC))
(repeated utf8-tail 2))
(sequence
(is #\xED)
(in (ucs-range->char-set/inclusive #x80 #x9F))
utf8-tail)
(sequence
(in (ucs-range->char-set/inclusive #xEE #xEF))
(repeated utf8-tail 2))))
(define utf8-4
(any-of
(sequence
(is #\xF0)
(in (ucs-range->char-set/inclusive #x90 #xBF))
(repeated utf8-tail 2))
(sequence
(in (ucs-range->char-set/inclusive #xF1 #xF3))
(repeated utf8-tail 3))
(sequence
(is #\xF4)
(in (ucs-range->char-set/inclusive #x80 #x8F))
(repeated utf8-tail 2))))
(define utf8-char
(any-of
utf8-1
utf8-2
utf8-3
utf8-4))
(define utf8-string
(followed-by (zero-or-more utf8-char) end-of-input))
;; (parse utf8-string (->parser-input "Hello-µ@ßöäüàá-UTF-8!!"))
;; (parse utf8-char (->parser-input #\a))
;; (define (valid-utf8? s)
;; (let ((len (string-length s)))
;; (let loop ((i 0))
;; (if (= i len)
;; #t
;; (let ((r (parse utf8-char (->parser-input (->string (string-ref s i))))))
;; (if r
;; (loop (+ i (length r)))
;; (string-ref s i)))))))
;; (valid-utf8? "Hello-µ@ßöäüàá-UTF-8!!")
;; (valid-utf8? "Hello")
;; (parse utf8-char (->parser-input (->string #\H)))
;; #\xC0