Add another utf8 decoder.
							parent
							
								
									daef7b3ea4
								
							
						
					
					
						commit
						baf570ab65
					
				@ -0,0 +1,56 @@
 | 
				
			|||||||
 | 
					// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
 | 
				
			||||||
 | 
					// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include "utf8decode.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* DFA states that callers of decode() need to recognise. */
					#define UTF8_ACCEPT 0
					#define UTF8_REJECT 1

					/*
					 * Combined lookup table for the UTF-8 decoding automaton.
					 *
					 *   utf8d[0 .. 255]   maps an input byte to its character class.
					 *   utf8d[256 .. 399] maps (state * 16 + class) to the next state.
					 *
					 * The values are data, not logic: do not reorder or "tidy" them.
					 */
					static const uint8_t utf8d[] = {
					  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
					  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
					  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
					  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
					  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
					  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
					  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
					  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
					  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
					  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
					  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
					  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
					  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
					  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
					};

					/*
					 * Feed one byte to the decoder.
					 *
					 * state: in/out DFA state; start a new sequence with UTF8_ACCEPT.
					 * codep: in/out accumulator for the code point being assembled. It is
					 *        only read when *state != UTF8_ACCEPT on entry, so it does not
					 *        need a meaningful value before the first byte of a sequence.
					 * byte:  the next input byte; it indexes the first 256 table entries,
					 *        so it must be in the range 0..255.
					 *
					 * Returns the new state: UTF8_ACCEPT when a complete code point is
					 * available in *codep, UTF8_REJECT once the input is malformed, any
					 * other value while more bytes of the current sequence are expected.
					 */
					static inline uint32_t
					decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
					  uint32_t type = utf8d[byte];

					  /* Mid-sequence: append the low six payload bits.
					   * Sequence start: mask off the length-marker bits of the lead byte. */
					  *codep = (*state != UTF8_ACCEPT) ?
					    (byte & 0x3fu) | (*codep << 6) :
					    (0xff >> type) & (byte);

					  *state = utf8d[256 + *state*16 + type];
					  return *state;
					}

					/*
					 * Run the UTF-8 decoder over the `len` bytes starting at `s`.
					 *
					 * NOTE: despite its name this function does not return a code-point
					 * count. It returns 0 when the buffer is well-formed UTF-8 that ends on
					 * a code-point boundary, and 1 otherwise (malformed or truncated). That
					 * contract is kept unchanged for existing callers.
					 *
					 * An empty buffer (len == 0) is reported as valid; `s` is not
					 * dereferenced in that case.
					 */
					int
					countCodePoints(uint8_t* s, size_t len) {
					  uint32_t codepoint = 0;
					  uint32_t state = UTF8_ACCEPT;

					  /* Decode exactly s[0] .. s[len-1]. Advancing the pointer before
					   * reading would skip the first byte and read one past the end. */
					  for (size_t i = 0; i < len; ++i) {
					    if (decode(&state, &codepoint, s[i]) == UTF8_REJECT) {
					      /* UTF8_REJECT is absorbing (every transition out of it leads
					       * back to it), so the remaining bytes cannot change the result. */
					      break;
					    }
					  }

					  return state != UTF8_ACCEPT;
					}
 | 
				
			||||||
@ -0,0 +1,25 @@
 | 
				
			|||||||
 | 
					#ifndef UTF8_DECODE_H
					#define UTF8_DECODE_H

					#include <stdlib.h>
					#include <stdint.h>

					/* decode() is defined static in the implementation file, so it is not
					 * exported from this header. */
					//inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte);

					/*
					 * Run the UTF-8 decoder over a byte buffer.
					 *
					 * s:   buffer to examine.
					 * len: number of bytes to process.
					 *
					 * Returns 0 if the decoder finishes in its accepting state and 1
					 * otherwise. NOTE: despite the name, the current implementation does
					 * not return a code-point count.
					 */
					extern int countCodePoints(uint8_t* s, size_t len);

					/* Validator API that is currently disabled; kept for reference. */
					#if 0
					typedef struct {
					   size_t current_index;
					   size_t total_index;
					   int state;
					   int is_valid;
					   int ends_on_codepoint;
					} utf8_validator_t;

					extern void utf8vld_reset (utf8_validator_t* validator);

					extern void utf8vld_validate (utf8_validator_t* validator, const uint8_t* data, size_t offset, size_t length);

					extern int utf8_valid(const uint8_t* data, size_t len);
					#endif

					#endif // UTF8_DECODE_H
 | 
				
			||||||
					Loading…
					
					
				
		Reference in New Issue