mirror of
				https://github.com/asterisk/asterisk.git
				synced 2025-10-29 23:39:35 +00:00 
			
		
		
		
	
		
			
	
	
		
			381 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
		
		
			
		
	
	
			381 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/*
 * Asterisk -- An open source telephony toolkit.
 *
 * Copyright (C) 2020, Sean Bright
 *
 * Sean Bright <sean.bright@gmail.com>
 *
 * See http://www.asterisk.org for more information about
 * the Asterisk project. Please do not directly contact
 * any of the maintainers of this project for assistance;
 * the project provides a web site, mailing lists and IRC
 * channels for your use.
 *
 * This program is free software, distributed under the terms of
 * the GNU General Public License Version 2. See the LICENSE file
 * at the top of the source tree.
 */
|  | 
 | ||
/*! \file
 *
 * \brief UTF-8 information and validation functions
 */
|  | 
 | ||
/*** MODULEINFO
	 <support_level>core</support_level>
***/
|  | 
 | ||
|  | #include "asterisk.h"
 | ||
|  | 
 | ||
|  | #include "asterisk/utils.h"
 | ||
|  | #include "asterisk/utf8.h"
 | ||
|  | #include "asterisk/test.h"
 | ||
|  | 
 | ||
/*
 * BEGIN THIRD PARTY CODE
 *
 * Copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
 */
|  | 
 | ||
/*
 * States of the validation automaton. A state value is the offset of that
 * state's row inside the transition part of utf8d (each row holds one entry
 * per character class, 12 in total), which is why "reject" is 12 and not 1.
 */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12
|  | 
 | ||
/*
 * Combined lookup table for the UTF-8 validation automaton.
 *
 * Entries [0, 255] map an input byte to its character class (0-11).
 * Entries from 256 onwards form the transition table: the next state for
 * state S and class C is found at index 256 + S + C.
 */
static const uint8_t utf8d[] = {
	/* The first part of the table maps bytes to character classes
	 * to reduce the size of the transition table and create bitmasks. */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

	/* The second part is a transition table that maps a combination
	 * of a state of the automaton and a character class to a state. */
	0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
	12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
	12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
	12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
	12,36,12,12,12,12,12,12,12,12,12,12,
};
|  | 
 | ||
|  | #if 0
 | ||
|  | /* We can bring this back if we need the codepoint? */ | ||
|  | static uint32_t inline decode(uint32_t *state, uint32_t *codep, uint32_t byte) { | ||
|  | 	uint32_t type = utf8d[byte]; | ||
|  | 
 | ||
|  | 	*codep = (*state != UTF8_ACCEPT) ? | ||
|  | 		(byte & 0x3fu) | (*codep << 6) : | ||
|  | 		(0xff >> type) & (byte); | ||
|  | 
 | ||
|  | 	*state = utf8d[256 + *state + type]; | ||
|  | 	return *state; | ||
|  | } | ||
|  | #endif
 | ||
|  | 
 | ||
|  | static uint32_t inline decode(uint32_t *state, uint32_t byte) { | ||
|  | 	uint32_t type = utf8d[byte]; | ||
|  | 	*state = utf8d[256 + *state + type]; | ||
|  | 	return *state; | ||
|  | } | ||
|  | 
 | ||
/*
 * END THIRD PARTY CODE
 *
 * See copyright notice above.
 */
|  | 
 | ||
|  | int ast_utf8_is_valid(const char *src) | ||
|  | { | ||
|  | 	uint32_t state = UTF8_ACCEPT; | ||
|  | 
 | ||
|  | 	while (*src) { | ||
|  | 		decode(&state, (uint8_t) *src++); | ||
|  | 	} | ||
|  | 
 | ||
|  | 	return state == UTF8_ACCEPT; | ||
|  | } | ||
|  | 
 | ||
|  | int ast_utf8_is_validn(const char *src, size_t size) | ||
|  | { | ||
|  | 	uint32_t state = UTF8_ACCEPT; | ||
|  | 
 | ||
|  | 	while (size && *src) { | ||
|  | 		decode(&state, (uint8_t) *src++); | ||
|  | 		size--; | ||
|  | 	} | ||
|  | 
 | ||
|  | 	return state == UTF8_ACCEPT; | ||
|  | } | ||
|  | 
 | ||
|  | void ast_utf8_copy_string(char *dst, const char *src, size_t size) | ||
|  | { | ||
|  | 	uint32_t state = UTF8_ACCEPT; | ||
|  | 	char *last_good = dst; | ||
|  | 
 | ||
|  | 	ast_assert(size > 0); | ||
|  | 
 | ||
|  | 	while (size && *src) { | ||
|  | 		if (decode(&state, (uint8_t) *src) == UTF8_REJECT) { | ||
|  | 			/* We _could_ replace with U+FFFD and try to recover, but for now
 | ||
|  | 			 * we treat this the same as if we had run out of space */ | ||
|  | 			break; | ||
|  | 		} | ||
|  | 
 | ||
|  | 		*dst++ = *src++; | ||
|  | 		size--; | ||
|  | 
 | ||
|  | 		if (size && state == UTF8_ACCEPT) { | ||
|  | 			/* last_good is where we will ultimately write the 0 byte */ | ||
|  | 			last_good = dst; | ||
|  | 		} | ||
|  | 	} | ||
|  | 
 | ||
|  | 	*last_good = '\0'; | ||
|  | } | ||
|  | 
 | ||
/*! \brief Progressive validator: remembers the automaton state between feeds. */
struct ast_utf8_validator {
	/*! Automaton state reached after all data fed so far */
	uint32_t state;
};
|  | 
 | ||
|  | int ast_utf8_validator_new(struct ast_utf8_validator **validator) | ||
|  | { | ||
|  | 	struct ast_utf8_validator *tmp = ast_malloc(sizeof(*tmp)); | ||
|  | 
 | ||
|  | 	if (!tmp) { | ||
|  | 		return 1; | ||
|  | 	} | ||
|  | 
 | ||
|  | 	tmp->state = UTF8_ACCEPT; | ||
|  | 	*validator = tmp; | ||
|  | 	return 0; | ||
|  | } | ||
|  | 
 | ||
|  | enum ast_utf8_validation_result ast_utf8_validator_state( | ||
|  | 	struct ast_utf8_validator *validator) | ||
|  | { | ||
|  | 	switch (validator->state) { | ||
|  | 	case UTF8_ACCEPT: | ||
|  | 		return AST_UTF8_VALID; | ||
|  | 	case UTF8_REJECT: | ||
|  | 		return AST_UTF8_INVALID; | ||
|  | 	default: | ||
|  | 		return AST_UTF8_UNKNOWN; | ||
|  | 	} | ||
|  | } | ||
|  | 
 | ||
|  | enum ast_utf8_validation_result ast_utf8_validator_feed( | ||
|  | 	struct ast_utf8_validator *validator, const char *data) | ||
|  | { | ||
|  | 	while (*data) { | ||
|  | 		decode(&validator->state, (uint8_t) *data++); | ||
|  | 	} | ||
|  | 
 | ||
|  | 	return ast_utf8_validator_state(validator); | ||
|  | } | ||
|  | 
 | ||
|  | enum ast_utf8_validation_result ast_utf8_validator_feedn( | ||
|  | 	struct ast_utf8_validator *validator, const char *data, size_t size) | ||
|  | { | ||
|  | 	while (size && *data) { | ||
|  | 		decode(&validator->state, (uint8_t) *data++); | ||
|  | 		size--; | ||
|  | 	} | ||
|  | 
 | ||
|  | 	return ast_utf8_validator_state(validator); | ||
|  | } | ||
|  | 
 | ||
|  | void ast_utf8_validator_reset(struct ast_utf8_validator *validator) | ||
|  | { | ||
|  | 	validator->state = UTF8_ACCEPT; | ||
|  | } | ||
|  | 
 | ||
/*!
 * \brief Release a validator obtained from ast_utf8_validator_new().
 *
 * \param validator Validator to free.
 */
void ast_utf8_validator_destroy(struct ast_utf8_validator *validator)
{
	ast_free(validator);
}
|  | 
 | ||
|  | #ifdef TEST_FRAMEWORK
 | ||
|  | 
 | ||
|  | AST_TEST_DEFINE(test_utf8_is_valid) | ||
|  | { | ||
|  | 	switch (cmd) { | ||
|  | 	case TEST_INIT: | ||
|  | 		info->name = "is_valid"; | ||
|  | 		info->category = "/main/utf8/"; | ||
|  | 		info->summary = "Test ast_utf8_is_valid and ast_utf8_is_validn"; | ||
|  | 		info->description = | ||
|  | 			"Tests UTF-8 string validation code."; | ||
|  | 		return AST_TEST_NOT_RUN; | ||
|  | 	case TEST_EXECUTE: | ||
|  | 		break; | ||
|  | 	} | ||
|  | 
 | ||
|  | 	/* Valid UTF-8 */ | ||
|  | 	ast_test_validate(test, ast_utf8_is_valid("Asterisk")); | ||
|  | 	ast_test_validate(test, ast_utf8_is_valid("\xce\xbb")); | ||
|  | 	ast_test_validate(test, ast_utf8_is_valid("\xe2\x8a\x9b")); | ||
|  | 	ast_test_validate(test, ast_utf8_is_valid("\xf0\x9f\x93\x9e")); | ||
|  | 
 | ||
|  | 	/* Valid with leading */ | ||
|  | 	ast_test_validate(test, ast_utf8_is_valid("aaa Asterisk")); | ||
|  | 	ast_test_validate(test, ast_utf8_is_valid("aaa \xce\xbb")); | ||
|  | 	ast_test_validate(test, ast_utf8_is_valid("aaa \xe2\x8a\x9b")); | ||
|  | 	ast_test_validate(test, ast_utf8_is_valid("aaa \xf0\x9f\x93\x9e")); | ||
|  | 
 | ||
|  | 	/* Valid with trailing */ | ||
|  | 	ast_test_validate(test, ast_utf8_is_valid("Asterisk aaa")); | ||
|  | 	ast_test_validate(test, ast_utf8_is_valid("\xce\xbb aaa")); | ||
|  | 	ast_test_validate(test, ast_utf8_is_valid("\xe2\x8a\x9b aaa")); | ||
|  | 	ast_test_validate(test, ast_utf8_is_valid("\xf0\x9f\x93\x9e aaa")); | ||
|  | 
 | ||
|  | 	/* Valid with leading and trailing */ | ||
|  | 	ast_test_validate(test, ast_utf8_is_valid("aaa Asterisk aaa")); | ||
|  | 	ast_test_validate(test, ast_utf8_is_valid("aaa \xce\xbb aaa")); | ||
|  | 	ast_test_validate(test, ast_utf8_is_valid("aaa \xe2\x8a\x9b aaa")); | ||
|  | 	ast_test_validate(test, ast_utf8_is_valid("aaa \xf0\x9f\x93\x9e aaa")); | ||
|  | 
 | ||
|  | 	/* Valid if limited by number of bytes */ | ||
|  | 	ast_test_validate(test, ast_utf8_is_validn("Asterisk" "\xff", strlen("Asterisk"))); | ||
|  | 	ast_test_validate(test, ast_utf8_is_validn("\xce\xbb" "\xff", strlen("\xce\xbb"))); | ||
|  | 	ast_test_validate(test, ast_utf8_is_validn("\xe2\x8a\x9b" "\xff", strlen("\xe2\x8a\x9b"))); | ||
|  | 	ast_test_validate(test, ast_utf8_is_validn("\xf0\x9f\x93\x9e" "\xff", strlen("\xf0\x9f\x93\x9e"))); | ||
|  | 
 | ||
|  | 	/* Invalid */ | ||
|  | 	ast_test_validate(test, !ast_utf8_is_valid("\xc0\x8a")); /* Overlong */ | ||
|  | 	ast_test_validate(test, !ast_utf8_is_valid("98.6\xa7")); /* 'High ASCII' */ | ||
|  | 	ast_test_validate(test, !ast_utf8_is_valid("\xc3\x28")); | ||
|  | 	ast_test_validate(test, !ast_utf8_is_valid("\xa0\xa1")); | ||
|  | 	ast_test_validate(test, !ast_utf8_is_valid("\xe2\x28\xa1")); | ||
|  | 	ast_test_validate(test, !ast_utf8_is_valid("\xe2\x82\x28")); | ||
|  | 	ast_test_validate(test, !ast_utf8_is_valid("\xf0\x28\x8c\xbc")); | ||
|  | 	ast_test_validate(test, !ast_utf8_is_valid("\xf0\x90\x28\xbc")); | ||
|  | 	ast_test_validate(test, !ast_utf8_is_valid("\xf0\x28\x8c\x28")); | ||
|  | 
 | ||
|  | 	return AST_TEST_PASS; | ||
|  | } | ||
|  | 
 | ||
/*
 * Test helper: copy src into a scratch buffer of dst_len bytes with
 * ast_utf8_copy_string() and report whether the result equals cmp.
 *
 * dst_len must be at least 1 (the scratch buffer is a variable-length array).
 * Returns non-zero when the copied string matches cmp, 0 otherwise.
 */
static int test_copy_and_compare(const char *src, size_t dst_len, const char *cmp)
{
	char dst[dst_len];

	ast_utf8_copy_string(dst, src, dst_len);
	return strcmp(dst, cmp) == 0;
}
|  | 
 | ||
|  | AST_TEST_DEFINE(test_utf8_copy_string) | ||
|  | { | ||
|  | 	switch (cmd) { | ||
|  | 	case TEST_INIT: | ||
|  | 		info->name = "copy_string"; | ||
|  | 		info->category = "/main/utf8/"; | ||
|  | 		info->summary = "Test ast_utf8_copy_string"; | ||
|  | 		info->description = | ||
|  | 			"Tests UTF-8 string copying code."; | ||
|  | 		return AST_TEST_NOT_RUN; | ||
|  | 	case TEST_EXECUTE: | ||
|  | 		break; | ||
|  | 	} | ||
|  | 
 | ||
|  | 	ast_test_validate(test, test_copy_and_compare("Asterisk",           6, "Aster")); | ||
|  | 	ast_test_validate(test, test_copy_and_compare("Asterisk \xc2\xae", 11, "Asterisk ")); | ||
|  | 	ast_test_validate(test, test_copy_and_compare("Asterisk \xc2\xae", 12, "Asterisk \xc2\xae")); | ||
|  | 	ast_test_validate(test, test_copy_and_compare("Asterisk \xc0\x8a", 12, "Asterisk ")); | ||
|  | 	ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 1, "")); | ||
|  | 	ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 2, "")); | ||
|  | 	ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 3, "\xce\xbb")); | ||
|  | 	ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 4, "\xce\xbb ")); | ||
|  | 	ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 5, "\xce\xbb x")); | ||
|  | 	ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 6, "\xce\xbb xy")); | ||
|  | 	ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 7, "\xce\xbb xyz")); | ||
|  | 
 | ||
|  | 	return AST_TEST_PASS; | ||
|  | } | ||
|  | 
 | ||
|  | AST_TEST_DEFINE(test_utf8_validator) | ||
|  | { | ||
|  | 	struct ast_utf8_validator *validator; | ||
|  | 
 | ||
|  | 	switch (cmd) { | ||
|  | 	case TEST_INIT: | ||
|  | 		info->name = "utf8_validator"; | ||
|  | 		info->category = "/main/utf8/"; | ||
|  | 		info->summary = "Test ast_utf8_validator"; | ||
|  | 		info->description = | ||
|  | 			"Tests UTF-8 progressive validator code."; | ||
|  | 		return AST_TEST_NOT_RUN; | ||
|  | 	case TEST_EXECUTE: | ||
|  | 		break; | ||
|  | 	} | ||
|  | 
 | ||
|  | 	if (ast_utf8_validator_new(&validator)) { | ||
|  | 		return AST_TEST_FAIL; | ||
|  | 	} | ||
|  | 
 | ||
|  | 	ast_test_validate(test, ast_utf8_validator_feed(validator, "Asterisk") == AST_UTF8_VALID); | ||
|  | 	ast_test_validate(test, ast_utf8_validator_feed(validator, "\xc2")     == AST_UTF8_UNKNOWN); | ||
|  | 	ast_test_validate(test, ast_utf8_validator_feed(validator, "\xae")     == AST_UTF8_VALID); | ||
|  | 	ast_test_validate(test, ast_utf8_validator_feed(validator, "Private")  == AST_UTF8_VALID); | ||
|  | 	ast_test_validate(test, ast_utf8_validator_feed(validator, "Branch")   == AST_UTF8_VALID); | ||
|  | 	ast_test_validate(test, ast_utf8_validator_feed(validator, "Exchange") == AST_UTF8_VALID); | ||
|  | 	ast_test_validate(test, ast_utf8_validator_feed(validator, "\xe2")     == AST_UTF8_UNKNOWN); | ||
|  | 	ast_test_validate(test, ast_utf8_validator_feed(validator, "\x84")     == AST_UTF8_UNKNOWN); | ||
|  | 	ast_test_validate(test, ast_utf8_validator_feed(validator, "\xbb")     == AST_UTF8_VALID); | ||
|  | 	ast_test_validate(test, ast_utf8_validator_feed(validator, "\xc0\x8a") == AST_UTF8_INVALID); | ||
|  | 	ast_test_validate(test, ast_utf8_validator_feed(validator, "valid")    == AST_UTF8_INVALID); | ||
|  | 	ast_test_validate(test, ast_utf8_validator_feed(validator, "valid")    == AST_UTF8_INVALID); | ||
|  | 	ast_test_validate(test, ast_utf8_validator_feed(validator, "valid")    == AST_UTF8_INVALID); | ||
|  | 
 | ||
|  | 	ast_utf8_validator_destroy(validator); | ||
|  | 
 | ||
|  | 	return AST_TEST_PASS; | ||
|  | } | ||
|  | 
 | ||
|  | static void test_utf8_shutdown(void) | ||
|  | { | ||
|  | 	AST_TEST_UNREGISTER(test_utf8_is_valid); | ||
|  | 	AST_TEST_UNREGISTER(test_utf8_copy_string); | ||
|  | 	AST_TEST_UNREGISTER(test_utf8_validator); | ||
|  | } | ||
|  | 
 | ||
|  | int ast_utf8_init(void) | ||
|  | { | ||
|  | 	AST_TEST_REGISTER(test_utf8_is_valid); | ||
|  | 	AST_TEST_REGISTER(test_utf8_copy_string); | ||
|  | 	AST_TEST_REGISTER(test_utf8_validator); | ||
|  | 
 | ||
|  | 	ast_register_cleanup(test_utf8_shutdown); | ||
|  | 
 | ||
|  | 	return 0; | ||
|  | } | ||
|  | 
 | ||
|  | #else /* !TEST_FRAMEWORK */
 | ||
|  | 
 | ||
/*!
 * \brief UTF-8 subsystem initialisation when the test framework is disabled.
 *
 * There is nothing to register in this configuration.
 *
 * \retval 0 always.
 */
int ast_utf8_init(void)
{
	return 0;
}
|  | 
 | ||
|  | #endif
 |