[ Avaa Bypassed ]




Upload:

Command:

hmhc3928@3.137.221.114: ~ $
/*
   +----------------------------------------------------------------------+
   | PHP Version 5                                                        |
   +----------------------------------------------------------------------+
   | Copyright (c) 1997-2016 The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
   | Authors: Rasmus Lerdorf <rasmus@php.net>                             |
   |          Jaakko Hyvätti <jaakko.hyvatti@iki.fi>                      |
   |          Wez Furlong    <wez@thebrainroom.com>                       |
   |          Gustavo Lopes  <cataphract@php.net>                         |
   +----------------------------------------------------------------------+
*/

/* $Id$ */

/*
 * HTML entity resources:
 *
 * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
 *
 * XHTML 1.0 DTD
 * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
 *
 * From HTML 4.01 strict DTD:
 * http://www.w3.org/TR/html4/HTMLlat1.ent
 * http://www.w3.org/TR/html4/HTMLsymbol.ent
 * http://www.w3.org/TR/html4/HTMLspecial.ent
 *
 * HTML 5:
 * http://dev.w3.org/html5/spec/Overview.html#named-character-references
 */

#include "php.h"
#if PHP_WIN32
#include "config.w32.h"
#else
#include <php_config.h>
#endif
#include "php_standard.h"
#include "php_string.h"
#include "SAPI.h"
#if HAVE_LOCALE_H
#include <locale.h>
#endif
#if HAVE_LANGINFO_H
#include <langinfo.h>
#endif

#include <zend_hash.h>
#include "html_tables.h"

/* Macro for disabling flag of translation of non-basic entities where this isn't supported.
 * Not appropriate for html_entity_decode/htmlspecialchars_decode */
#define LIMIT_ALL(all, doctype, charset) do { \
	(all) = (all) && !CHARSET_PARTIAL_SUPPORT((charset)) && ((doctype) != ENT_HTML_DOC_XML1); \
} while (0)

#define MB_FAILURE(pos, advance) do { \
	*cursor = pos + (advance); \
	*status = FAILURE; \
	return 0; \
} while (0)

#define CHECK_LEN(pos, chars_need) ((str_len - (pos)) >= (chars_need))

/* valid as single byte character or leading byte */
#define utf8_lead(c)  ((c) < 0x80 || ((c) >= 0xC2 && (c) <= 0xF4))
/* whether it's actually valid depends on other stuff;
 * this macro cannot check for non-shortest forms, surrogates or
 * code points above 0x10FFFF */
#define utf8_trail(c) ((c) >= 0x80 && (c) <= 0xBF)

#define gb2312_lead(c) ((c) != 0x8E && (c) != 0x8F && (c) != 0xA0 && (c) != 0xFF)
#define gb2312_trail(c) ((c) >= 0xA1 && (c) <= 0xFE)

#define sjis_lead(c) ((c) != 0x80 && (c) != 0xA0 && (c) < 0xFD)
#define sjis_trail(c) ((c) >= 0x40  && (c) != 0x7F && (c) < 0xFD)

/* {{{ get_default_charset
 */
static char *get_default_charset(TSRMLS_D) {
	if (PG(internal_encoding) && PG(internal_encoding)[0]) {
		return PG(internal_encoding);
	} else if (SG(default_charset) && SG(default_charset)[0] ) {
		return SG(default_charset);
	}
	return NULL;
}
/* }}} */

/* {{{ get_next_char
 */
static inline unsigned int get_next_char(
		enum entity_charset charset,
		const unsigned char *str,
		size_t str_len,
		size_t *cursor,
		int *status)
{
	size_t pos = *cursor;
	unsigned int this_char = 0;

	*status = SUCCESS;
	assert(pos <= str_len);

	if (!CHECK_LEN(pos, 1))
		MB_FAILURE(pos, 1);

	switch (charset) {
	case cs_utf_8:
		{
			/* We'll follow strategy 2. from section 3.6.1 of UTR #36:
			 * "In a reported illegal byte sequence, do not include any
			 *  non-initial byte that encodes a valid character or is a leading
			 *  byte for a valid sequence." */
			unsigned char c;
			c = str[pos];
			if (c < 0x80) {
				this_char = c;
				pos++;
			} else if (c < 0xc2) {
				MB_FAILURE(pos, 1);
			} else if (c < 0xe0) {
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				if (!utf8_trail(str[pos + 1])) {
					MB_FAILURE(pos, utf8_lead(str[pos + 1]) ? 1 : 2);
				}
				this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
				if (this_char < 0x80) { /* non-shortest form */
					MB_FAILURE(pos, 2);
				}
				pos += 2;
			} else if (c < 0xf0) {
				size_t avail = str_len - pos;

				if (avail < 3 ||
						!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2])) {
					if (avail < 2 || utf8_lead(str[pos + 1]))
						MB_FAILURE(pos, 1);
					else if (avail < 3 || utf8_lead(str[pos + 2]))
						MB_FAILURE(pos, 2);
					else
						MB_FAILURE(pos, 3);
				}

				this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
				if (this_char < 0x800) { /* non-shortest form */
					MB_FAILURE(pos, 3);
				} else if (this_char >= 0xd800 && this_char <= 0xdfff) { /* surrogate */
					MB_FAILURE(pos, 3);
				}
				pos += 3;
			} else if (c < 0xf5) {
				size_t avail = str_len - pos;

				if (avail < 4 ||
						!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2]) ||
						!utf8_trail(str[pos + 3])) {
					if (avail < 2 || utf8_lead(str[pos + 1]))
						MB_FAILURE(pos, 1);
					else if (avail < 3 || utf8_lead(str[pos + 2]))
						MB_FAILURE(pos, 2);
					else if (avail < 4 || utf8_lead(str[pos + 3]))
						MB_FAILURE(pos, 3);
					else
						MB_FAILURE(pos, 4);
				}

				this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
				if (this_char < 0x10000 || this_char > 0x10FFFF) { /* non-shortest form or outside range */
					MB_FAILURE(pos, 4);
				}
				pos += 4;
			} else {
				MB_FAILURE(pos, 1);
			}
		}
		break;

	case cs_big5:
		/* reference http://demo.icu-project.org/icu-bin/convexp?conv=big5 */
		{
			unsigned char c = str[pos];
			if (c >= 0x81 && c <= 0xFE) {
				unsigned char next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				next = str[pos + 1];

				if ((next >= 0x40 && next <= 0x7E) ||
						(next >= 0xA1 && next <= 0xFE)) {
					this_char = (c << 8) | next;
				} else {
					MB_FAILURE(pos, 1);
				}
				pos += 2;
			} else {
				this_char = c;
				pos += 1;
			}
		}
		break;

	case cs_big5hkscs:
		{
			unsigned char c = str[pos];
			if (c >= 0x81 && c <= 0xFE) {
				unsigned char next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				next = str[pos + 1];

				if ((next >= 0x40 && next <= 0x7E) ||
						(next >= 0xA1 && next <= 0xFE)) {
					this_char = (c << 8) | next;
				} else if (next != 0x80 && next != 0xFF) {
					MB_FAILURE(pos, 1);
				} else {
					MB_FAILURE(pos, 2);
				}
				pos += 2;
			} else {
				this_char = c;
				pos += 1;
			}
		}
		break;

	case cs_gb2312: /* EUC-CN */
		{
			unsigned char c = str[pos];
			if (c >= 0xA1 && c <= 0xFE) {
				unsigned char next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				next = str[pos + 1];

				if (gb2312_trail(next)) {
					this_char = (c << 8) | next;
				} else if (gb2312_lead(next)) {
					MB_FAILURE(pos, 1);
				} else {
					MB_FAILURE(pos, 2);
				}
				pos += 2;
			} else if (gb2312_lead(c)) {
				this_char = c;
				pos += 1;
			} else {
				MB_FAILURE(pos, 1);
			}
		}
		break;

	case cs_sjis:
		{
			unsigned char c = str[pos];
			if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xFC)) {
				unsigned char next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				next = str[pos + 1];

				if (sjis_trail(next)) {
					this_char = (c << 8) | next;
				} else if (sjis_lead(next)) {
					MB_FAILURE(pos, 1);
				} else {
					MB_FAILURE(pos, 2);
				}
				pos += 2;
			} else if (c < 0x80 || (c >= 0xA1 && c <= 0xDF)) {
				this_char = c;
				pos += 1;
			} else {
				MB_FAILURE(pos, 1);
			}
		}
		break;

	case cs_eucjp:
		{
			unsigned char c = str[pos];

			if (c >= 0xA1 && c <= 0xFE) {
				unsigned next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);
				next = str[pos + 1];

				if (next >= 0xA1 && next <= 0xFE) {
					/* this a jis kanji char */
					this_char = (c << 8) | next;
				} else {
					MB_FAILURE(pos, (next != 0xA0 && next != 0xFF) ? 1 : 2);
				}
				pos += 2;
			} else if (c == 0x8E) {
				unsigned next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				next = str[pos + 1];
				if (next >= 0xA1 && next <= 0xDF) {
					/* JIS X 0201 kana */
					this_char = (c << 8) | next;
				} else {
					MB_FAILURE(pos, (next != 0xA0 && next != 0xFF) ? 1 : 2);
				}
				pos += 2;
			} else if (c == 0x8F) {
				size_t avail = str_len - pos;

				if (avail < 3 || !(str[pos + 1] >= 0xA1 && str[pos + 1] <= 0xFE) ||
						!(str[pos + 2] >= 0xA1 && str[pos + 2] <= 0xFE)) {
					if (avail < 2 || (str[pos + 1] != 0xA0 && str[pos + 1] != 0xFF))
						MB_FAILURE(pos, 1);
					else if (avail < 3 || (str[pos + 2] != 0xA0 && str[pos + 2] != 0xFF))
						MB_FAILURE(pos, 2);
					else
						MB_FAILURE(pos, 3);
				} else {
					/* JIS X 0212 hojo-kanji */
					this_char = (c << 16) | (str[pos + 1] << 8) | str[pos + 2];
				}
				pos += 3;
			} else if (c != 0xA0 && c != 0xFF) {
				/* character encoded in 1 code unit */
				this_char = c;
				pos += 1;
			} else {
				MB_FAILURE(pos, 1);
			}
		}
		break;
	default:
		/* single-byte charsets */
		this_char = str[pos++];
		break;
	}

	*cursor = pos;
  	return this_char;
}
/* }}} */

/* {{{ php_next_utf8_char
 * Public interface for get_next_char used with UTF-8 */
 PHPAPI unsigned int php_next_utf8_char(
		const unsigned char *str,
		size_t str_len,
		size_t *cursor,
		int *status)
{
	return get_next_char(cs_utf_8, str, str_len, cursor, status);
}
/* }}} */

/* {{{ entity_charset determine_charset
 * returns the charset identifier based on current locale or a hint.
 * defaults to UTF-8 */
static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
{
	int i;
	enum entity_charset charset = cs_utf_8;
	int len = 0;
	const zend_encoding *zenc;

	/* Default is now UTF-8 */
	if (charset_hint == NULL)
		return cs_utf_8;

	if ((len = strlen(charset_hint)) != 0) {
		goto det_charset;
	}

	zenc = zend_multibyte_get_internal_encoding(TSRMLS_C);
	if (zenc != NULL) {
		charset_hint = (char *)zend_multibyte_get_encoding_name(zenc);
		if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
			if ((len == 4) /* sizeof (none|auto|pass) */ &&
					(!memcmp("pass", charset_hint, 4) ||
					 !memcmp("auto", charset_hint, 4) ||
					 !memcmp("none", charset_hint, 4))) {
				charset_hint = NULL;
				len = 0;
			} else {
				goto det_charset;
			}
		}
	}

	charset_hint = SG(default_charset);
	if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
		goto det_charset;
	}

	/* try to detect the charset for the locale */
#if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
	charset_hint = nl_langinfo(CODESET);
	if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
		goto det_charset;
	}
#endif

#if HAVE_LOCALE_H
	/* try to figure out the charset from the locale */
	{
		char *localename;
		char *dot, *at;

		/* lang[_territory][.codeset][@modifier] */
		localename = setlocale(LC_CTYPE, NULL);

		dot = strchr(localename, '.');
		if (dot) {
			dot++;
			/* locale specifies a codeset */
			at = strchr(dot, '@');
			if (at)
				len = at - dot;
			else
				len = strlen(dot);
			charset_hint = dot;
		} else {
			/* no explicit name; see if the name itself
			 * is the charset */
			charset_hint = localename;
			len = strlen(charset_hint);
		}
	}
#endif

det_charset:

	if (charset_hint) {
		int found = 0;

		/* now walk the charset map and look for the codeset */
		for (i = 0; charset_map[i].codeset; i++) {
			if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
				charset = charset_map[i].charset;
				found = 1;
				break;
			}
		}
		if (!found) {
			php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming utf-8",
					charset_hint);
		}
	}
	return charset;
}
/* }}} */

/* {{{ php_utf32_utf8 */
static inline size_t php_utf32_utf8(unsigned char *buf, unsigned k)
{
	size_t retval = 0;

	/* assert(0x0 <= k <= 0x10FFFF); */

	if (k < 0x80) {
		buf[0] = k;
		retval = 1;
	} else if (k < 0x800) {
		buf[0] = 0xc0 | (k >> 6);
		buf[1] = 0x80 | (k & 0x3f);
		retval = 2;
	} else if (k < 0x10000) {
		buf[0] = 0xe0 | (k >> 12);
		buf[1] = 0x80 | ((k >> 6) & 0x3f);
		buf[2] = 0x80 | (k & 0x3f);
		retval = 3;
	} else {
		buf[0] = 0xf0 | (k >> 18);
		buf[1] = 0x80 | ((k >> 12) & 0x3f);
		buf[2] = 0x80 | ((k >> 6) & 0x3f);
		buf[3] = 0x80 | (k & 0x3f);
		retval = 4;
	}
	/* UTF-8 has been restricted to max 4 bytes since RFC 3629 */

	return retval;
}
/* }}} */

/* {{{ php_mb2_int_to_char
 * Convert back big endian int representation of sequence of one or two 8-bit code units. */
static inline size_t php_mb2_int_to_char(unsigned char *buf, unsigned k)
{
	assert(k <= 0xFFFFU);
	/* one or two bytes */
	if (k <= 0xFFU) { /* 1 */
		buf[0] = k;
		return 1U;
	} else { /* 2 */
		buf[0] = k >> 8;
		buf[1] = k & 0xFFU;
		return 2U;
	}
}
/* }}} */

/* {{{ php_mb3_int_to_char
 * Convert back big endian int representation of sequence of one to three 8-bit code units.
 * For EUC-JP. */
static inline size_t php_mb3_int_to_char(unsigned char *buf, unsigned k)
{
	assert(k <= 0xFFFFFFU);
	/* one to three bytes */
	if (k <= 0xFFU) { /* 1 */
		buf[0] = k;
		return 1U;
	} else if (k <= 0xFFFFU) { /* 2 */
		buf[0] = k >> 8;
		buf[1] = k & 0xFFU;
		return 2U;
	} else {
		buf[0] = k >> 16;
		buf[1] = (k >> 8) & 0xFFU;
		buf[2] = k & 0xFFU;
		return 3U;
	}
}
/* }}} */


/* {{{ unimap_bsearc_cmp
 * Binary search of unicode code points in unicode <--> charset mapping.
 * Returns the code point in the target charset (whose mapping table was given) or 0 if
 * the unicode code point is not in the table.
 */
static inline unsigned char unimap_bsearch(const uni_to_enc *table, unsigned code_key_a, size_t num)
{
	const uni_to_enc *l = table,
					 *h = &table[num-1],
					 *m;
	unsigned short code_key;

	/* we have no mappings outside the BMP */
	if (code_key_a > 0xFFFFU)
		return 0;

	code_key = (unsigned short) code_key_a;

	while (l <= h) {
		m = l + (h - l) / 2;
		if (code_key < m->un_code_point)
			h = m - 1;
		else if (code_key > m->un_code_point)
			l = m + 1;
		else
			return m->cs_code;
	}
	return 0;
}
/* }}} */

/* {{{ map_from_unicode */
static inline int map_from_unicode(unsigned code, enum entity_charset charset, unsigned *res)
{
	unsigned char found;
	const uni_to_enc *table;
	size_t table_size;

	switch (charset) {
	case cs_8859_1:
		/* identity mapping of code points to unicode */
		if (code > 0xFF) {
			return FAILURE;
		}
		*res = code;
		break;

	case cs_8859_5:
		if (code <= 0xA0 || code == 0xAD /* soft hyphen */) {
			*res = code;
		} else if (code == 0x2116) {
			*res = 0xF0; /* numero sign */
		} else if (code == 0xA7) {
			*res = 0xFD; /* section sign */
		} else if (code >= 0x0401 && code <= 0x044F) {
			if (code == 0x040D || code == 0x0450 || code == 0x045D)
				return FAILURE;
			*res = code - 0x360;
		} else {
			return FAILURE;
		}
		break;

	case cs_8859_15:
		if (code < 0xA4 || (code > 0xBE && code <= 0xFF)) {
			*res = code;
		} else { /* between A4 and 0xBE */
			found = unimap_bsearch(unimap_iso885915,
				code, sizeof(unimap_iso885915) / sizeof(*unimap_iso885915));
			if (found)
				*res = found;
			else
				return FAILURE;
		}
		break;

	case cs_cp1252:
		if (code <= 0x7F || (code >= 0xA0 && code <= 0xFF)) {
			*res = code;
		} else {
			found = unimap_bsearch(unimap_win1252,
				code, sizeof(unimap_win1252) / sizeof(*unimap_win1252));
			if (found)
				*res = found;
			else
				return FAILURE;
		}
		break;

	case cs_macroman:
		if (code == 0x7F)
			return FAILURE;
		table = unimap_macroman;
		table_size = sizeof(unimap_macroman) / sizeof(*unimap_macroman);
		goto table_over_7F;
	case cs_cp1251:
		table = unimap_win1251;
		table_size = sizeof(unimap_win1251) / sizeof(*unimap_win1251);
		goto table_over_7F;
	case cs_koi8r:
		table = unimap_koi8r;
		table_size = sizeof(unimap_koi8r) / sizeof(*unimap_koi8r);
		goto table_over_7F;
	case cs_cp866:
		table = unimap_cp866;
		table_size = sizeof(unimap_cp866) / sizeof(*unimap_cp866);

table_over_7F:
		if (code <= 0x7F) {
			*res = code;
		} else {
			found = unimap_bsearch(table, code, table_size);
			if (found)
				*res = found;
			else
				return FAILURE;
		}
		break;

	/* from here on, only map the possible characters in the ASCII range.
	 * to improve support here, it's a matter of building the unicode mappings.
	 * See <http://www.unicode.org/Public/6.0.0/ucd/Unihan.zip> */
	case cs_sjis:
	case cs_eucjp:
		/* we interpret 0x5C as the Yen symbol. This is not universal.
		 * See <http://www.w3.org/Submission/japanese-xml/#ambiguity_of_yen> */
		if (code >= 0x20 && code <= 0x7D) {
			if (code == 0x5C)
				return FAILURE;
			*res = code;
		} else {
			return FAILURE;
		}
		break;

	case cs_big5:
	case cs_big5hkscs:
	case cs_gb2312:
		if (code >= 0x20 && code <= 0x7D) {
			*res = code;
		} else {
			return FAILURE;
		}
		break;

	default:
		return FAILURE;
	}

	return SUCCESS;
}
/* }}} */

/* {{{ */
static inline void map_to_unicode(unsigned code, const enc_to_uni *table, unsigned *res)
{
	/* only single byte encodings are currently supported; assumed code <= 0xFF */
	*res = table->inner[ENT_ENC_TO_UNI_STAGE1(code)]->uni_cp[ENT_ENC_TO_UNI_STAGE2(code)];
}
/* }}} */

/* {{{ unicode_cp_is_allowed */
static inline int unicode_cp_is_allowed(unsigned uni_cp, int document_type)
{
	/* XML 1.0				HTML 4.01			HTML 5
	 * 0x09..0x0A			0x09..0x0A			0x09..0x0A
	 * 0x0D					0x0D				0x0C..0x0D
	 * 0x0020..0xD7FF		0x20..0x7E			0x20..0x7E
	 *						0x00A0..0xD7FF		0x00A0..0xD7FF
	 * 0xE000..0xFFFD		0xE000..0x10FFFF	0xE000..0xFDCF
	 * 0x010000..0x10FFFF						0xFDF0..0x10FFFF (*)
	 *
	 * (*) exclude code points where ((code & 0xFFFF) >= 0xFFFE)
	 *
	 * References:
	 * XML 1.0:   <http://www.w3.org/TR/REC-xml/#charsets>
	 * HTML 4.01: <http://www.w3.org/TR/1999/PR-html40-19990824/sgml/sgmldecl.html>
	 * HTML 5:    <http://dev.w3.org/html5/spec/Overview.html#preprocessing-the-input-stream>
	 *
	 * Not sure this is the relevant part for HTML 5, though. I opted to
	 * disallow the characters that would result in a parse error when
	 * preprocessing of the input stream. See also section 8.1.3.
	 *
	 * It's unclear if XHTML 1.0 allows C1 characters. I'll opt to apply to
	 * XHTML 1.0 the same rules as for XML 1.0.
	 * See <http://cmsmcq.com/2007/C1.xml>.
	 */

	switch (document_type) {
	case ENT_HTML_DOC_HTML401:
		return (uni_cp >= 0x20 && uni_cp <= 0x7E) ||
			(uni_cp == 0x0A || uni_cp == 0x09 || uni_cp == 0x0D) ||
			(uni_cp >= 0xA0 && uni_cp <= 0xD7FF) ||
			(uni_cp >= 0xE000 && uni_cp <= 0x10FFFF);
	case ENT_HTML_DOC_HTML5:
		return (uni_cp >= 0x20 && uni_cp <= 0x7E) ||
			(uni_cp >= 0x09 && uni_cp <= 0x0D && uni_cp != 0x0B) || /* form feed U+0C allowed */
			(uni_cp >= 0xA0 && uni_cp <= 0xD7FF) ||
			(uni_cp >= 0xE000 && uni_cp <= 0x10FFFF &&
				((uni_cp & 0xFFFF) < 0xFFFE) && /* last two of each plane (nonchars) disallowed */
				(uni_cp < 0xFDD0 || uni_cp > 0xFDEF)); /* U+FDD0-U+FDEF (nonchars) disallowed */
	case ENT_HTML_DOC_XHTML:
	case ENT_HTML_DOC_XML1:
		return (uni_cp >= 0x20 && uni_cp <= 0xD7FF) ||
			(uni_cp == 0x0A || uni_cp == 0x09 || uni_cp == 0x0D) ||
			(uni_cp >= 0xE000 && uni_cp <= 0x10FFFF && uni_cp != 0xFFFE && uni_cp != 0xFFFF);
	default:
		return 1;
	}
}
/* }}} */

/* {{{ unicode_cp_is_allowed */
static inline int numeric_entity_is_allowed(unsigned uni_cp, int document_type)
{
	/* less restrictive than unicode_cp_is_allowed */
	switch (document_type) {
	case ENT_HTML_DOC_HTML401:
		/* all non-SGML characters (those marked with UNUSED in DESCSET) should be
		 * representable with numeric entities */
		return uni_cp <= 0x10FFFF;
	case ENT_HTML_DOC_HTML5:
		/* 8.1.4. The numeric character reference forms described above are allowed to
		 * reference any Unicode code point other than U+0000, U+000D, permanently
		 * undefined Unicode characters (noncharacters), and control characters other
		 * than space characters (U+0009, U+000A, U+000C and U+000D) */
		/* seems to allow surrogate characters, then */
		return (uni_cp >= 0x20 && uni_cp <= 0x7E) ||
			(uni_cp >= 0x09 && uni_cp <= 0x0C && uni_cp != 0x0B) || /* form feed U+0C allowed, but not U+0D */
			(uni_cp >= 0xA0 && uni_cp <= 0x10FFFF &&
				((uni_cp & 0xFFFF) < 0xFFFE) && /* last two of each plane (nonchars) disallowed */
				(uni_cp < 0xFDD0 || uni_cp > 0xFDEF)); /* U+FDD0-U+FDEF (nonchars) disallowed */
	case ENT_HTML_DOC_XHTML:
	case ENT_HTML_DOC_XML1:
		/* OTOH, XML 1.0 requires "character references to match the production for Char
		 * See <http://www.w3.org/TR/REC-xml/#NT-CharRef> */
		return unicode_cp_is_allowed(uni_cp, document_type);
	default:
		return 1;
	}
}
/* }}} */

/* {{{ process_numeric_entity
 * Auxiliary function to traverse_for_entities.
 * On input, *buf should point to the first character after # and on output, it's the last
 * byte read, no matter if there was success or insuccess.
 */
static inline int process_numeric_entity(const char **buf, unsigned *code_point)
{
	long code_l;
	int hexadecimal = (**buf == 'x' || **buf == 'X'); /* TODO: XML apparently disallows "X" */
	char *endptr;

	if (hexadecimal && (**buf != '\0'))
		(*buf)++;

	/* strtol allows whitespace and other stuff in the beginning
		* we're not interested */
	if ((hexadecimal && !isxdigit(**buf)) ||
			(!hexadecimal && !isdigit(**buf))) {
		return FAILURE;
	}

	code_l = strtol(*buf, &endptr, hexadecimal ? 16 : 10);
	/* we're guaranteed there were valid digits, so *endptr > buf */
	*buf = endptr;

	if (**buf != ';')
		return FAILURE;

	/* many more are invalid, but that depends on whether it's HTML
	 * (and which version) or XML. */
	if (code_l > 0x10FFFFL)
		return FAILURE;

	if (code_point != NULL)
		*code_point = (unsigned)code_l;

	return SUCCESS;
}
/* }}} */

/* {{{ process_named_entity */
static inline int process_named_entity_html(const char **buf, const char **start, size_t *length)
{
	*start = *buf;

	/* "&" is represented by a 0x26 in all supported encodings. That means
	 * the byte after represents a character or is the leading byte of an
	 * sequence of 8-bit code units. If in the ranges below, it represents
	 * necessarily a alpha character because none of the supported encodings
	 * has an overlap with ASCII in the leading byte (only on the second one) */
	while ((**buf >= 'a' && **buf <= 'z') ||
			(**buf >= 'A' && **buf <= 'Z') ||
			(**buf >= '0' && **buf <= '9')) {
		(*buf)++;
	}

	if (**buf != ';')
		return FAILURE;

	/* cast to size_t OK as the quantity is always non-negative */
	*length = *buf - *start;

	if (*length == 0)
		return FAILURE;

	return SUCCESS;
}
/* }}} */

/* {{{ resolve_named_entity_html */
static inline int resolve_named_entity_html(const char *start, size_t length, const entity_ht *ht, unsigned *uni_cp1, unsigned *uni_cp2)
{
	const entity_cp_map *s;
	ulong hash = zend_inline_hash_func(start, length);

	s = ht->buckets[hash % ht->num_elems];
	while (s->entity) {
		if (s->entity_len == length) {
			if (memcmp(start, s->entity, length) == 0) {
				*uni_cp1 = s->codepoint1;
				*uni_cp2 = s->codepoint2;
				return SUCCESS;
			}
		}
		s++;
	}
	return FAILURE;
}
/* }}} */

static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charset charset, unsigned code) {
	/* code is not necessarily a unicode code point */
	switch (charset) {
	case cs_utf_8:
		return php_utf32_utf8(buf, code);

	case cs_8859_1:
	case cs_cp1252:
	case cs_8859_15:
	case cs_koi8r:
	case cs_cp1251:
	case cs_8859_5:
	case cs_cp866:
	case cs_macroman:
		/* single byte stuff */
		*buf = code;
		return 1;

	case cs_big5:
	case cs_big5hkscs:
	case cs_sjis:
	case cs_gb2312:
		/* we don't have complete unicode mappings for these yet in entity_decode,
		 * and we opt to pass through the octet sequences for these in htmlentities
		 * instead of converting to an int and then converting back. */
#if 0
		return php_mb2_int_to_char(buf, code);
#else
#if ZEND_DEBUG
		assert(code <= 0xFFU);
#endif
		*buf = code;
		return 1;
#endif

	case cs_eucjp:
#if 0 /* idem */
		return php_mb2_int_to_char(buf, code);
#else
#if ZEND_DEBUG
		assert(code <= 0xFFU);
#endif
		*buf = code;
		return 1;
#endif

	default:
		assert(0);
		return 0;
	}
}

/* {{{ traverse_for_entities
 * Auxiliary function to php_unescape_html_entities().
 * - The argument "all" determines if all numeric entities are decode or only those
 *   that correspond to quotes (depending on quote_style).
 */
/* maximum expansion (factor 1.2) for HTML 5 with &nGt; and &nLt; */
/* +2 is 1 because of rest (probably unnecessary), 1 because of terminating 0 */
#define TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) ((oldlen) + (oldlen) / 5 + 2)
static void traverse_for_entities(
	const char *old,
	size_t oldlen,
	char *ret, /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) */
	size_t *retlen,
	int all,
	int flags,
	const entity_ht *inv_map,
	enum entity_charset charset)
{
	const char *p,
			   *lim;
	char	   *q;
	int doctype = flags & ENT_HTML_DOC_TYPE_MASK;

	lim = old + oldlen; /* terminator address */
	assert(*lim == '\0');

	for (p = old, q = ret; p < lim;) {
		unsigned code, code2 = 0;
		const char *next = NULL; /* when set, next > p, otherwise possible inf loop */

		/* Shift JIS, Big5 and HKSCS use multi-byte encodings where an
		 * ASCII range byte can be part of a multi-byte sequence.
		 * However, they start at 0x40, therefore if we find a 0x26 byte,
		 * we're sure it represents the '&' character. */

		/* assumes there are no single-char entities */
		if (p[0] != '&' || (p + 3 >= lim)) {
			*(q++) = *(p++);
			continue;
		}

		/* now p[3] is surely valid and is no terminator */

		/* numerical entity */
		if (p[1] == '#') {
			next = &p[2];
			if (process_numeric_entity(&next, &code) == FAILURE)
				goto invalid_code;

			/* If we're in htmlspecialchars_decode, we're only decoding entities
			 * that represent &, <, >, " and '. Is this one of them? */
			if (!all && (code > 63U ||
					stage3_table_be_apos_00000[code].data.ent.entity == NULL))
				goto invalid_code;

			/* are we allowed to decode this entity in this document type?
			 * HTML 5 is the only that has a character that cannot be used in
			 * a numeric entity but is allowed literally (U+000D). The
			 * unoptimized version would be ... || !numeric_entity_is_allowed(code) */
			if (!unicode_cp_is_allowed(code, doctype) ||
					(doctype == ENT_HTML_DOC_HTML5 && code == 0x0D))
				goto invalid_code;
		} else {
			const char *start;
			size_t ent_len;

			next = &p[1];
			start = next;

			if (process_named_entity_html(&next, &start, &ent_len) == FAILURE)
				goto invalid_code;

			if (resolve_named_entity_html(start, ent_len, inv_map, &code, &code2) == FAILURE) {
				if (doctype == ENT_HTML_DOC_XHTML && ent_len == 4 && start[0] == 'a'
							&& start[1] == 'p' && start[2] == 'o' && start[3] == 's') {
					/* uses html4 inv_map, which doesn't include apos;. This is a
					 * hack to support it */
					code = (unsigned) '\'';
				} else {
					goto invalid_code;
				}
			}
		}

		assert(*next == ';');

		if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
				(code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE)))
				/* && code2 == '\0' always true for current maps */)
			goto invalid_code;

		/* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
		 * the call is needed to ensure the codepoint <= U+00FF)  */
		if (charset != cs_utf_8) {
			/* replace unicode code point */
			if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0)
				goto invalid_code; /* not representable in target charset */
		}

		q += write_octet_sequence(q, charset, code);
		if (code2) {
			q += write_octet_sequence(q, charset, code2);
		}

		/* jump over the valid entity; may go beyond size of buffer; np */
		p = next + 1;
		continue;

invalid_code:
		for (; p < next; p++) {
			*(q++) = *p;
		}
	}

	*q = '\0';
	*retlen = (size_t)(q - ret);
}
/* }}} */

/* {{{ unescape_inverse_map */
static const entity_ht *unescape_inverse_map(int all, int flags)
{
	int document_type = flags & ENT_HTML_DOC_TYPE_MASK;

	if (all) {
		switch (document_type) {
		case ENT_HTML_DOC_HTML401:
		case ENT_HTML_DOC_XHTML: /* but watch out for &apos;...*/
			return &ent_ht_html4;
		case ENT_HTML_DOC_HTML5:
			return &ent_ht_html5;
		default:
			return &ent_ht_be_apos;
		}
	} else {
		switch (document_type) {
		case ENT_HTML_DOC_HTML401:
			return &ent_ht_be_noapos;
		default:
			return &ent_ht_be_apos;
		}
	}
}
/* }}} */

/* {{{ determine_entity_table
 * Entity table to use. Note that entity tables are defined in terms of
 * unicode code points */
static entity_table_opt determine_entity_table(int all, int doctype)
{
	entity_table_opt retval = {NULL};

	assert(!(doctype == ENT_HTML_DOC_XML1 && all));

	if (all) {
		retval.ms_table = (doctype == ENT_HTML_DOC_HTML5) ?
			entity_ms_table_html5 : entity_ms_table_html4;
	} else {
		retval.table = (doctype == ENT_HTML_DOC_HTML401) ?
			stage3_table_be_noapos_00000 : stage3_table_be_apos_00000;
	}
	return retval;
}
/* }}} */

/* {{{ php_unescape_html_entities
 * The parameter "all" should be true to decode all possible entities, false to decode
 * only the basic ones, i.e., those in basic_entities_ex + the numeric entities
 * that correspond to quotes.
 */
PHPAPI char *php_unescape_html_entities(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset TSRMLS_DC)
{
	size_t retlen;
	char *ret;
	enum entity_charset charset;
	const entity_ht *inverse_map = NULL;
	size_t new_size = TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen);

	if (all) {
		charset = determine_charset(hint_charset TSRMLS_CC);
	} else {
		charset = cs_8859_1; /* charset shouldn't matter, use ISO-8859-1 for performance */
	}

	/* don't use LIMIT_ALL! */

	if (oldlen > new_size) {
		/* overflow, refuse to do anything */
		ret = estrndup((char*)old, oldlen);
		retlen = oldlen;
		goto empty_source;
	}
	ret = emalloc(new_size);
	*ret = '\0';
	retlen = oldlen;
	if (retlen == 0) {
		goto empty_source;
	}

	inverse_map = unescape_inverse_map(all, flags);

	/* replace numeric entities */
	traverse_for_entities(old, oldlen, ret, &retlen, all, flags, inverse_map, charset);

empty_source:
	*newlen = retlen;
	return ret;
}
/* }}} */

PHPAPI char *php_escape_html_entities(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset TSRMLS_DC)
{
	return php_escape_html_entities_ex(old, oldlen, newlen, all, flags, hint_charset, 1 TSRMLS_CC);
}

/* {{{ find_entity_for_char */
static inline void find_entity_for_char(
	unsigned int k,
	enum entity_charset charset,
	const entity_stage1_row *table,
	const unsigned char **entity,
	size_t *entity_len,
	unsigned char *old,
	size_t oldlen,
	size_t *cursor)
{
	unsigned stage1_idx = ENT_STAGE1_INDEX(k);
	const entity_stage3_row *c;

	if (stage1_idx > 0x1D) {
		*entity     = NULL;
		*entity_len = 0;
		return;
	}

	c = &table[stage1_idx][ENT_STAGE2_INDEX(k)][ENT_STAGE3_INDEX(k)];

	if (!c->ambiguous) {
		*entity     = (const unsigned char *)c->data.ent.entity;
		*entity_len = c->data.ent.entity_len;
	} else {
		/* peek at next char */
		size_t	 cursor_before	= *cursor;
		int		 status			= SUCCESS;
		unsigned next_char;

		if (!(*cursor < oldlen))
			goto no_suitable_2nd;

		next_char = get_next_char(charset, old, oldlen, cursor, &status);

		if (status == FAILURE)
			goto no_suitable_2nd;

		{
			const entity_multicodepoint_row *s, *e;

			s = &c->data.multicodepoint_table[1];
			e = s - 1 + c->data.multicodepoint_table[0].leading_entry.size;
			/* we could do a binary search but it's not worth it since we have
			 * at most two entries... */
			for ( ; s <= e; s++) {
				if (s->normal_entry.second_cp == next_char) {
					*entity     = s->normal_entry.entity;
					*entity_len = s->normal_entry.entity_len;
					return;
				}
			}
		}
no_suitable_2nd:
		*cursor = cursor_before;
		*entity = (const unsigned char *)
			c->data.multicodepoint_table[0].leading_entry.default_entity;
		*entity_len = c->data.multicodepoint_table[0].leading_entry.default_entity_len;
	}
}
/* }}} */

/* {{{ find_entity_for_char_basic */
static inline void find_entity_for_char_basic(
	unsigned int k,
	const entity_stage3_row *table,
	const unsigned char **entity,
	size_t *entity_len)
{
	if (k >= 64U) {
		*entity     = NULL;
		*entity_len = 0;
		return;
	}

	*entity     = table[k].data.ent.entity;
	*entity_len = table[k].data.ent.entity_len;
}
/* }}} */

/* {{{ php_escape_html_entities
 */
PHPAPI char *php_escape_html_entities_ex(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset, zend_bool double_encode TSRMLS_DC)
{
	size_t cursor, maxlen, len;
	char *replaced;
	enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
	int doctype = flags & ENT_HTML_DOC_TYPE_MASK;
	entity_table_opt entity_table;
	const enc_to_uni *to_uni_table = NULL;
	const entity_ht *inv_map = NULL; /* used for !double_encode */
	/* only used if flags includes ENT_HTML_IGNORE_ERRORS or ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS */
	const unsigned char *replacement = NULL;
	size_t replacement_len = 0;

	if (all) { /* replace with all named entities */
		if (CHARSET_PARTIAL_SUPPORT(charset)) {
			php_error_docref0(NULL TSRMLS_CC, E_STRICT, "Only basic entities "
				"substitution is supported for multi-byte encodings other than UTF-8; "
				"functionality is equivalent to htmlspecialchars");
		}
		LIMIT_ALL(all, doctype, charset);
	}
	entity_table = determine_entity_table(all, doctype);
	if (all && !CHARSET_UNICODE_COMPAT(charset)) {
		to_uni_table = enc_to_uni_index[charset];
	}

	if (!double_encode) {
		/* first arg is 1 because we want to identify valid named entities
		 * even if we are only encoding the basic ones */
		inv_map = unescape_inverse_map(1, flags);
	}

	if (flags & (ENT_HTML_SUBSTITUTE_ERRORS | ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS)) {
		if (charset == cs_utf_8) {
			replacement = (const unsigned char*)"\xEF\xBF\xBD";
			replacement_len = sizeof("\xEF\xBF\xBD") - 1;
		} else {
			replacement = (const unsigned char*)"&#xFFFD;";
			replacement_len = sizeof("&#xFFFD;") - 1;
		}
	}

	/* initial estimate */
	if (oldlen < 64) {
		maxlen = 128;
	} else {
		maxlen = 2 * oldlen;
		if (maxlen < oldlen) {
			zend_error_noreturn(E_ERROR, "Input string is too long");
			return NULL;
		}
	}

	replaced = emalloc(maxlen + 1); /* adding 1 is safe: maxlen is even */
	len = 0;
	cursor = 0;
	while (cursor < oldlen) {
		const unsigned char *mbsequence = NULL;
		size_t mbseqlen					= 0,
		       cursor_before			= cursor;
		int status						= SUCCESS;
		unsigned int this_char			= get_next_char(charset, old, oldlen, &cursor, &status);

		/* guarantee we have at least 40 bytes to write.
		 * In HTML5, entities may take up to 33 bytes */
		if (len > maxlen - 40) { /* maxlen can never be smaller than 128 */
			replaced = safe_erealloc(replaced, maxlen , 1, 128 + 1);
			maxlen += 128;
		}

		if (status == FAILURE) {
			/* invalid MB sequence */
			if (flags & ENT_HTML_IGNORE_ERRORS) {
				continue;
			} else if (flags & ENT_HTML_SUBSTITUTE_ERRORS) {
				memcpy(&replaced[len], replacement, replacement_len);
				len += replacement_len;
				continue;
			} else {
				efree(replaced);
				*newlen = 0;
				return STR_EMPTY_ALLOC();
			}
		} else { /* SUCCESS */
			mbsequence = &old[cursor_before];
			mbseqlen = cursor - cursor_before;
		}

		if (this_char != '&') { /* no entity on this position */
			const unsigned char *rep	= NULL;
			size_t				rep_len	= 0;

			if (((this_char == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
					(this_char == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))))
				goto pass_char_through;

			if (all) { /* false that CHARSET_PARTIAL_SUPPORT(charset) */
				if (to_uni_table != NULL) {
					/* !CHARSET_UNICODE_COMPAT therefore not UTF-8; since UTF-8
					 * is the only multibyte encoding with !CHARSET_PARTIAL_SUPPORT,
					 * we're using a single byte encoding */
					map_to_unicode(this_char, to_uni_table, &this_char);
					if (this_char == 0xFFFF) /* no mapping; pass through */
						goto pass_char_through;
				}
				/* the cursor may advance */
				find_entity_for_char(this_char, charset, entity_table.ms_table, &rep,
					&rep_len, old, oldlen, &cursor);
			} else {
				find_entity_for_char_basic(this_char, entity_table.table, &rep, &rep_len);
			}

			if (rep != NULL) {
				replaced[len++] = '&';
				memcpy(&replaced[len], rep, rep_len);
				len += rep_len;
				replaced[len++] = ';';
			} else {
				/* we did not find an entity for this char.
				 * check for its validity, if its valid pass it unchanged */
				if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) {
					if (CHARSET_UNICODE_COMPAT(charset)) {
						if (!unicode_cp_is_allowed(this_char, doctype)) {
							mbsequence = replacement;
							mbseqlen = replacement_len;
						}
					} else if (to_uni_table) {
						if (!all) /* otherwise we already did this */
							map_to_unicode(this_char, to_uni_table, &this_char);
						if (!unicode_cp_is_allowed(this_char, doctype)) {
							mbsequence = replacement;
							mbseqlen = replacement_len;
						}
					} else {
						/* not a unicode code point, unless, coincidentally, it's in
						 * the 0x20..0x7D range (except 0x5C in sjis). We know nothing
						 * about other code points, because we have no tables. Since
						 * Unicode code points in that range are not disallowed in any
						 * document type, we could do nothing. However, conversion
						 * tables frequently map 0x00-0x1F to the respective C0 code
						 * points. Let's play it safe and admit that's the case */
						if (this_char <= 0x7D &&
								!unicode_cp_is_allowed(this_char, doctype)) {
							mbsequence = replacement;
							mbseqlen = replacement_len;
						}
					}
				}
pass_char_through:
				if (mbseqlen > 1) {
					memcpy(replaced + len, mbsequence, mbseqlen);
					len += mbseqlen;
				} else {
					replaced[len++] = mbsequence[0];
				}
			}
		} else { /* this_char == '&' */
			if (double_encode) {
encode_amp:
				memcpy(&replaced[len], "&amp;", sizeof("&amp;") - 1);
				len += sizeof("&amp;") - 1;
			} else { /* no double encode */
				/* check if entity is valid */
				size_t ent_len; /* not counting & or ; */
				/* peek at next char */
				if (old[cursor] == '#') { /* numeric entity */
					unsigned code_point;
					int valid;
					char *pos = (char*)&old[cursor+1];
					valid = process_numeric_entity((const char **)&pos, &code_point);
					if (valid == FAILURE)
						goto encode_amp;
					if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) {
						if (!numeric_entity_is_allowed(code_point, doctype))
							goto encode_amp;
					}
					ent_len = pos - (char*)&old[cursor];
				} else { /* named entity */
					/* check for vality of named entity */
					const char *start = &old[cursor],
							   *next = start;
					unsigned   dummy1, dummy2;

					if (process_named_entity_html(&next, &start, &ent_len) == FAILURE)
						goto encode_amp;
					if (resolve_named_entity_html(start, ent_len, inv_map, &dummy1, &dummy2) == FAILURE) {
						if (!(doctype == ENT_HTML_DOC_XHTML && ent_len == 4 && start[0] == 'a'
									&& start[1] == 'p' && start[2] == 'o' && start[3] == 's')) {
							/* uses html4 inv_map, which doesn't include apos;. This is a
							 * hack to support it */
							goto encode_amp;
						}
					}
				}
				/* checks passed; copy entity to result */
				/* entity size is unbounded, we may need more memory */
				/* at this point maxlen - len >= 40 */
				if (maxlen - len < ent_len + 2 /* & and ; */) {
					/* ent_len < oldlen, which is certainly <= SIZE_MAX/2 */
					replaced = safe_erealloc(replaced, maxlen, 1, ent_len + 128 + 1);
					maxlen += ent_len + 128;
				}
				replaced[len++] = '&';
				memcpy(&replaced[len], &old[cursor], ent_len);
				len += ent_len;
				replaced[len++] = ';';
				cursor += ent_len + 1;
			}
		}
	}
	replaced[len] = '\0';
	*newlen = len;
	if(len > INT_MAX) {
		zend_error_noreturn(E_ERROR, "Escaped string is too long");
		efree(replaced);
		return NULL;
	}

	return replaced;
}
/* }}} */

/* {{{ php_html_entities
 */
static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
{
	char *str, *hint_charset = NULL;
	int str_len, hint_charset_len = 0;
	size_t new_len;
	long flags = ENT_COMPAT;
	char *replaced;
	zend_bool double_encode = 1;

	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls!b", &str, &str_len, &flags, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) {
		return;
	}

	if (!hint_charset) {
		hint_charset = get_default_charset(TSRMLS_C);
	}
	replaced = php_escape_html_entities_ex(str, str_len, &new_len, all, (int) flags, hint_charset, double_encode TSRMLS_CC);

	RETVAL_STRINGL(replaced, (int)new_len, 0);
}
/* }}} */

#define HTML_SPECIALCHARS 	0
#define HTML_ENTITIES	 	1

/* {{{ register_html_constants
 */
void register_html_constants(INIT_FUNC_ARGS)
{
	REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS);
	REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS);
	REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS);
	REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS);
	REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS);
	REGISTER_LONG_CONSTANT("ENT_IGNORE", ENT_IGNORE, CONST_PERSISTENT|CONST_CS);
	REGISTER_LONG_CONSTANT("ENT_SUBSTITUTE", ENT_SUBSTITUTE, CONST_PERSISTENT|CONST_CS);
	REGISTER_LONG_CONSTANT("ENT_DISALLOWED", ENT_DISALLOWED, CONST_PERSISTENT|CONST_CS);
	REGISTER_LONG_CONSTANT("ENT_HTML401", ENT_HTML401, CONST_PERSISTENT|CONST_CS);
	REGISTER_LONG_CONSTANT("ENT_XML1", ENT_XML1, CONST_PERSISTENT|CONST_CS);
	REGISTER_LONG_CONSTANT("ENT_XHTML", ENT_XHTML, CONST_PERSISTENT|CONST_CS);
	REGISTER_LONG_CONSTANT("ENT_HTML5", ENT_HTML5, CONST_PERSISTENT|CONST_CS);
}
/* }}} */

/* {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]])
   Convert special characters to HTML entities */
PHP_FUNCTION(htmlspecialchars)
{
	php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
}
/* }}} */

/* {{{ proto string htmlspecialchars_decode(string string [, int quote_style])
   Convert special HTML entities back to characters */
PHP_FUNCTION(htmlspecialchars_decode)
{
	char *str;
	int str_len;
	size_t new_len = 0;
	long quote_style = ENT_COMPAT;
	char *replaced;

	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len, &quote_style) == FAILURE) {
		return;
	}

	replaced = php_unescape_html_entities(str, str_len, &new_len, 0 /*!all*/, quote_style, NULL TSRMLS_CC);
	if (replaced) {
		RETURN_STRINGL(replaced, (int)new_len, 0);
	}
	RETURN_FALSE;
}
/* }}} */

/* {{{ proto string html_entity_decode(string string [, int quote_style][, string charset])
   Convert all HTML entities to their applicable characters */
PHP_FUNCTION(html_entity_decode)
{
	char *str, *hint_charset = NULL;
	int str_len, hint_charset_len;
	size_t new_len = 0;
	long quote_style = ENT_COMPAT;
	char *replaced;

	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len,
							  &quote_style, &hint_charset, &hint_charset_len) == FAILURE) {
		return;
	}

	if (!hint_charset) {
		hint_charset = get_default_charset(TSRMLS_C);
	}
	replaced = php_unescape_html_entities(str, str_len, &new_len, 1 /*all*/, quote_style, hint_charset TSRMLS_CC);

	if (replaced) {
		RETURN_STRINGL(replaced, (int)new_len, 0);
	}
	RETURN_FALSE;
}
/* }}} */


/* {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]])
   Convert all applicable characters to HTML entities */
PHP_FUNCTION(htmlentities)
{
	php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
}
/* }}} */

/* {{{ write_s3row_data */
static inline void write_s3row_data(
	const entity_stage3_row *r,
	unsigned orig_cp,
	enum entity_charset charset,
	zval *arr)
{
	char key[9] = ""; /* two unicode code points in UTF-8 */
	char entity[LONGEST_ENTITY_LENGTH + 2] = {'&'};
	size_t written_k1;

	written_k1 = write_octet_sequence(key, charset, orig_cp);

	if (!r->ambiguous) {
		size_t l = r->data.ent.entity_len;
		memcpy(&entity[1], r->data.ent.entity, l);
		entity[l + 1] = ';';
		add_assoc_stringl_ex(arr, key, written_k1 + 1, entity, l + 2, 1);
	} else {
		unsigned i,
			     num_entries;
		const entity_multicodepoint_row *mcpr = r->data.multicodepoint_table;

		if (mcpr[0].leading_entry.default_entity != NULL) {
			size_t l = mcpr[0].leading_entry.default_entity_len;
			memcpy(&entity[1], mcpr[0].leading_entry.default_entity, l);
			entity[l + 1] = ';';
			add_assoc_stringl_ex(arr, key, written_k1 + 1, entity, l + 2, 1);
		}
		num_entries = mcpr[0].leading_entry.size;
		for (i = 1; i <= num_entries; i++) {
			size_t   l,
				     written_k2;
			unsigned uni_cp,
					 spe_cp;

			uni_cp = mcpr[i].normal_entry.second_cp;
			l = mcpr[i].normal_entry.entity_len;

			if (!CHARSET_UNICODE_COMPAT(charset)) {
				if (map_from_unicode(uni_cp, charset, &spe_cp) == FAILURE)
					continue; /* non representable in this charset */
			} else {
				spe_cp = uni_cp;
			}

			written_k2 = write_octet_sequence(&key[written_k1], charset, spe_cp);
			memcpy(&entity[1], mcpr[i].normal_entry.entity, l);
			entity[l + 1] = ';';
			entity[l + 1] = '\0';
			add_assoc_stringl_ex(arr, key, written_k1 + written_k2 + 1, entity, l + 1, 1);
		}
	}
}
/* }}} */

/* {{{ proto array get_html_translation_table([int table [, int flags [, string charset_hint]]])
   Returns the internal translation table used by htmlspecialchars and htmlentities */
PHP_FUNCTION(get_html_translation_table)
{
	long all = HTML_SPECIALCHARS,
		 flags = ENT_COMPAT;
	int doctype;
	entity_table_opt entity_table;
	const enc_to_uni *to_uni_table = NULL;
	char *charset_hint = NULL;
	int charset_hint_len;
	enum entity_charset charset;

	/* in this function we have to jump through some loops because we're
	 * getting the translated table from data structures that are optimized for
	 * random access, not traversal */

	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|lls",
			&all, &flags, &charset_hint, &charset_hint_len) == FAILURE) {
		return;
	}

	charset = determine_charset(charset_hint TSRMLS_CC);
	doctype = flags & ENT_HTML_DOC_TYPE_MASK;
	LIMIT_ALL(all, doctype, charset);

	array_init(return_value);

	entity_table = determine_entity_table(all, doctype);
	if (all && !CHARSET_UNICODE_COMPAT(charset)) {
		to_uni_table = enc_to_uni_index[charset];
	}

	if (all) { /* HTML_ENTITIES (actually, any non-zero value for 1st param) */
		const entity_stage1_row *ms_table = entity_table.ms_table;

		if (CHARSET_UNICODE_COMPAT(charset)) {
			unsigned i, j, k,
					 max_i, max_j, max_k;
			/* no mapping to unicode required */
			if (CHARSET_SINGLE_BYTE(charset)) { /* ISO-8859-1 */
				max_i = 1; max_j = 4; max_k = 64;
			} else {
				max_i = 0x1E; max_j = 64; max_k = 64;
			}

			for (i = 0; i < max_i; i++) {
				if (ms_table[i] == empty_stage2_table)
					continue;
				for (j = 0; j < max_j; j++) {
					if (ms_table[i][j] == empty_stage3_table)
						continue;
					for (k = 0; k < max_k; k++) {
						const entity_stage3_row *r = &ms_table[i][j][k];
						unsigned code;

						if (r->data.ent.entity == NULL)
							continue;

						code = ENT_CODE_POINT_FROM_STAGES(i, j, k);
						if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
								(code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))))
							continue;
						write_s3row_data(r, code, charset, return_value);
					}
				}
			}
		} else {
			/* we have to iterate through the set of code points for this
			 * encoding and map them to unicode code points */
			unsigned i;
			for (i = 0; i <= 0xFF; i++) {
				const entity_stage3_row *r;
				unsigned uni_cp;

				/* can be done before mapping, they're invariant */
				if (((i == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
						(i == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))))
					continue;

				map_to_unicode(i, to_uni_table, &uni_cp);
				r = &ms_table[ENT_STAGE1_INDEX(uni_cp)][ENT_STAGE2_INDEX(uni_cp)][ENT_STAGE3_INDEX(uni_cp)];
				if (r->data.ent.entity == NULL)
					continue;

				write_s3row_data(r, i, charset, return_value);
			}
		}
	} else {
		/* we could use sizeof(stage3_table_be_apos_00000) as well */
		unsigned	  j,
					  numelems = sizeof(stage3_table_be_noapos_00000) /
							sizeof(*stage3_table_be_noapos_00000);

		for (j = 0; j < numelems; j++) {
			const entity_stage3_row *r = &entity_table.table[j];
			if (r->data.ent.entity == NULL)
				continue;

			if (((j == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
					(j == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))))
				continue;

			/* charset is indifferent, used cs_8859_1 for efficiency */
			write_s3row_data(r, j, cs_8859_1, return_value);
		}
	}
}
/* }}} */

/*
 * Local variables:
 * tab-width: 4
 * c-basic-offset: 4
 * End:
 * vim600: sw=4 ts=4 fdm=marker
 * vim<600: sw=4 ts=4
 */

Filemanager

Name Type Size Permission Actions
array.c File 130.19 KB 0644
assert.c File 9.31 KB 0644
base64.c File 7.73 KB 0644
base64.h File 1.57 KB 0644
basic_functions.c File 169.08 KB 0644
basic_functions.h File 7.42 KB 0644
browscap.c File 16.82 KB 0644
crc32.c File 1.77 KB 0644
crc32.h File 4.78 KB 0644
credits.c File 5.91 KB 0644
credits.h File 1.7 KB 0644
credits_ext.h File 5.51 KB 0644
credits_sapi.h File 1.63 KB 0644
crypt.c File 8.43 KB 0644
crypt_blowfish.c File 31.68 KB 0644
crypt_blowfish.h File 1.05 KB 0644
crypt_freesec.c File 21.64 KB 0644
crypt_freesec.h File 662 B 0644
crypt_sha256.c File 21.77 KB 0644
crypt_sha512.c File 26.45 KB 0644
css.c File 2.43 KB 0644
css.h File 1.21 KB 0644
cyr_convert.c File 11.56 KB 0644
datetime.c File 3.85 KB 0644
dir.c File 15.08 KB 0644
dl.c File 9.18 KB 0644
dl.h File 1.57 KB 0644
dns.c File 27.68 KB 0644
exec.c File 13.13 KB 0644
exec.h File 1.69 KB 0644
file.c File 68.46 KB 0644
file.h File 4.63 KB 0644
filestat.c File 34.39 KB 0644
filters.c File 56.51 KB 0644
flock_compat.c File 6.9 KB 0644
formatted_print.c File 20.19 KB 0644
fsock.c File 3.89 KB 0644
ftok.c File 2.22 KB 0644
ftp_fopen_wrapper.c File 32.1 KB 0644
head.c File 9.18 KB 0644
head.h File 1.62 KB 0644
html.c File 48.15 KB 0644
html.h File 2.71 KB 0644
html_tables.h File 471.57 KB 0644
http.c File 7.7 KB 0644
http_fopen_wrapper.c File 33.65 KB 0644
image.c File 40.83 KB 0644
incomplete_class.c File 5.61 KB 0644
info.c File 44.03 KB 0644
info.h File 20.2 KB 0644
iptc.c File 9.85 KB 0644
lcg.c File 3.11 KB 0644
levenshtein.c File 4.05 KB 0644
link.c File 5.83 KB 0644
mail.c File 13.74 KB 0644
math.c File 29.12 KB 0644
md5.c File 10.65 KB 0644
md5.h File 2.12 KB 0644
metaphone.c File 11.84 KB 0644
microtime.c File 4.36 KB 0644
pack.c File 27.05 KB 0644
pack.h File 1.25 KB 0644
pageinfo.c File 3.92 KB 0644
password.c File 12.06 KB 0644
php_array.h File 4.62 KB 0644
php_assert.h File 1.4 KB 0644
php_browscap.h File 1.3 KB 0644
php_crypt.h File 1.63 KB 0644
php_crypt_r.c File 10.78 KB 0644
php_crypt_r.h File 2 KB 0644
php_dir.h File 1.67 KB 0644
php_dns.h File 2.82 KB 0644
php_ext_syslog.h File 1.47 KB 0644
php_filestat.h File 3.28 KB 0644
php_fopen_wrapper.c File 11.49 KB 0644
php_fopen_wrappers.h File 1.92 KB 0644
php_image.h File 2.37 KB 0644
php_incomplete_class.h File 2.47 KB 0644
php_lcg.h File 1.5 KB 0644
php_mail.h File 1.37 KB 0644
php_password.h File 1.58 KB 0644
php_rand.h File 2.56 KB 0644
php_smart_str.h File 6.57 KB 0644
php_smart_str_public.h File 1.29 KB 0644
php_standard.h File 2.21 KB 0644
php_string.h File 6.23 KB 0644
php_var.h File 7.33 KB 0644
php_versioning.h File 1.37 KB 0644
proc_open.c File 26 KB 0644
proc_open.h File 1.81 KB 0644
quot_print.c File 7.51 KB 0644
quot_print.h File 1.51 KB 0644
rand.c File 11.01 KB 0644
scanf.c File 29.45 KB 0644
scanf.h File 2.27 KB 0644
sha1.c File 11.58 KB 0644
sha1.h File 1.71 KB 0644
soundex.c File 3.29 KB 0644
streamsfuncs.c File 45.17 KB 0644
string.c File 135.19 KB 0644
strnatcmp.c File 4.57 KB 0644
syslog.c File 6.35 KB 0644
type.c File 9.06 KB 0644
uniqid.c File 2.62 KB 0644
url.c File 17.9 KB 0644
url.h File 2.28 KB 0644
url_scanner_ex.c File 27.89 KB 0644
url_scanner_ex.h File 2.09 KB 0644
user_filters.c File 18.47 KB 0644
uuencode.c File 6.63 KB 0644
var.c File 29.15 KB 0644
var_unserializer.c File 29.18 KB 0644
versioning.c File 5.87 KB 0644