From d194289740ddc54242af58db9370f77b73ba438a Mon Sep 17 00:00:00 2001 From: Thomas Hintz Date: Fri, 17 Oct 2014 06:52:56 -0700 Subject: [PATCH] Switch to comparse for UTF8 validation. --- utf8-grammar.scm | 76 ++++++++++++++++++++++++++++++++++++++++++++++++ websockets.scm | 54 +++------------------------------- 2 files changed, 80 insertions(+), 50 deletions(-) create mode 100644 utf8-grammar.scm diff --git a/utf8-grammar.scm b/utf8-grammar.scm new file mode 100644 index 0000000..ec266ae --- /dev/null +++ b/utf8-grammar.scm @@ -0,0 +1,76 @@ +(import chicken scheme) +(use srfi-4 srfi-13 srfi-14 comparse) + + +(define (ucs-range->char-set/inclusive lower upper) + (ucs-range->char-set lower (add1 upper))) + +(define utf8-tail + (in (ucs-range->char-set/inclusive #x80 #xBF))) + +(define utf8-1 + (in (ucs-range->char-set/inclusive #x00 #x7F))) + +(define utf8-2 + (sequence + (in (ucs-range->char-set/inclusive #xC2 #xDF)) + utf8-tail)) + +(define utf8-3 + (any-of + (sequence + (is #\xE0) + (in (ucs-range->char-set/inclusive #xA0 #xBF)) + utf8-tail) + (sequence + (in (ucs-range->char-set/inclusive #xE1 #xEC)) + (repeated utf8-tail 2)) + (sequence + (is #\xED) + (in (ucs-range->char-set/inclusive #x80 #x9F)) + utf8-tail) + (sequence + (in (ucs-range->char-set/inclusive #xEE #xEF)) + (repeated utf8-tail 2)))) + +(define utf8-4 + (any-of + (sequence + (is #\xF0) + (in (ucs-range->char-set/inclusive #x90 #xBF)) + (repeated utf8-tail 2)) + (sequence + (in (ucs-range->char-set/inclusive #xF1 #xF3)) + (repeated utf8-tail 3)) + (sequence + (is #\xF4) + (in (ucs-range->char-set/inclusive #x80 #x8F)) + (repeated utf8-tail 2)))) + +(define utf8-char + (any-of + utf8-1 + utf8-2 + utf8-3 + utf8-4)) + +(define utf8-string + (followed-by (zero-or-more utf8-char) end-of-input)) + +;; (parse utf8-string (->parser-input "Hello-µ@ßöäüàá-UTF-8!!")) +;; (parse utf8-char (->parser-input #\a)) + +;; (define (valid-utf8? s) +;; (let ((len (string-length s))) +;; (let loop ((i 0)) +;; (if (= i len) +;; #t +;; (let ((r (parse utf8-char (->parser-input (->string (string-ref s i)))))) +;; (if r +;; (loop (+ i (length r))) +;; (string-ref s i))))))) +;; (valid-utf8? "Hello-µ@ßöäüàá-UTF-8!!") +;; (valid-utf8? "Hello") +;; (parse utf8-char (->parser-input (->string #\H))) + +;; #\xC0 diff --git a/websockets.scm b/websockets.scm index 655fa48..ceae0b9 100644 --- a/websockets.scm +++ b/websockets.scm @@ -25,9 +25,7 @@ (import chicken scheme data-structures extras ports posix foreign) (use srfi-1 srfi-4 spiffy intarweb uri-common base64 simple-sha1 srfi-18 - srfi-13 mailbox) - -(foreign-declare "#include \"utf8validator.c\"") + srfi-13 mailbox srfi-14 comparse) (define-inline (neq? obj1 obj2) (not (eq? obj1 obj2))) @@ -186,7 +184,7 @@ (dynamic-wind (lambda () (mutex-lock! (websocket-send-mutex ws))) (lambda () (send-frame ws optype data #t)) - (lambda () (mutex-unlock! (websocket-send-mutex ws)))) + (lambda () (mutex-unlock! (websocket-send-mutex ws))))) (define (websocket-unmask-frame-payload payload len frame-masking-key) (define tmaskkey (make-u8vector 4 #f #t #t)) @@ -339,53 +337,9 @@ (make-property-condition 'unhandled-optype 'optype frame-optype))))))))))) -(define (valid-utf8-2? s) - (define-external str c-string s) - (define-external len int (string-length s)) - (zero? - ((foreign-lambda* int () -" -static const uint8_t utf8d[] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df - 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef - 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff - 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 - 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 - 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 - 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 -}; - - uint32_t si; - uint32_t *state; - si = 0; - state = &si; - uint32_t type; - - for (int i = 0; i < len; i++) { - // type = utf8d[(uint8_t)str[i]]; - type = utf8d[*((uint8_t*)str)]; - *state = utf8d[256 + (*state) * 16 + type]; - - if (*state != 0) // reject - break; - } - - C_return(*state); -" -)) - )) - +(include "utf8-grammar.scm") (define (valid-utf8? s) - (let ((len (string-length s))) - ((foreign-lambda int "utf8_valid" scheme-pointer int) - s len))) + (parse utf8-string (->parser-input s) memoize: #t)) (define (close-code->integer s) (if (string-null? s)