Skip to content
Snippets Groups Projects
Commit 45f34467 authored by Dries Buytaert's avatar Dries Buytaert
Browse files

- Patch #212130 by Damien Tournoud, grendzy: decode_entities() should support all (X)HTML entities.

parent 86aa636c
No related branches found
No related tags found
2 merge requests!7452Issue #1797438. HTML5 validation is preventing form submit and not fully...,!789Issue #3210310: Adjust Database API to remove deprecated Drupal 9 code in Drupal 10
<?php
// $Id $
/**
* @file
* (X)HTML entities, as defined in HTML 4.01.
*
* @see http://www.w3.org/TR/html401/sgml/entities.html
*/
// Lookup table of named character entities: each key is the complete entity
// reference (leading '&' and trailing ';' included) and each value is the
// UTF-8 encoded replacement string. decode_entities() is expected to populate
// its static $html_entities table by including this file.
//
// Invisible, whitespace-only and normalization-sensitive code points are
// written as explicit UTF-8 byte escapes in double-quoted strings. As raw
// literals they cannot be told apart from a plain space or an empty string,
// and they are easily lost when the file passes through editors or transfer
// tools, which would make the entity decode to the wrong text.
$html_entities = array(
  '&Aacute;' => 'Á',
  '&aacute;' => 'á',
  '&Acirc;' => 'Â',
  '&acirc;' => 'â',
  '&acute;' => '´',
  '&AElig;' => 'Æ',
  '&aelig;' => 'æ',
  '&Agrave;' => 'À',
  '&agrave;' => 'à',
  '&alefsym;' => 'ℵ',
  '&Alpha;' => 'Α',
  '&alpha;' => 'α',
  '&amp;' => '&',
  '&and;' => '∧',
  '&ang;' => '∠',
  '&Aring;' => 'Å',
  '&aring;' => 'å',
  '&asymp;' => '≈',
  '&Atilde;' => 'Ã',
  '&atilde;' => 'ã',
  '&Auml;' => 'Ä',
  '&auml;' => 'ä',
  '&bdquo;' => '„',
  '&Beta;' => 'Β',
  '&beta;' => 'β',
  '&brvbar;' => '¦',
  '&bull;' => '•',
  '&cap;' => '∩',
  '&Ccedil;' => 'Ç',
  '&ccedil;' => 'ç',
  '&cedil;' => '¸',
  '&cent;' => '¢',
  '&Chi;' => 'Χ',
  '&chi;' => 'χ',
  '&circ;' => 'ˆ',
  '&clubs;' => '♣',
  '&cong;' => '≅',
  '&copy;' => '©',
  '&crarr;' => '↵',
  '&cup;' => '∪',
  '&curren;' => '¤',
  '&dagger;' => '†',
  '&Dagger;' => '‡',
  '&darr;' => '↓',
  '&dArr;' => '⇓',
  '&deg;' => '°',
  '&Delta;' => 'Δ',
  '&delta;' => 'δ',
  '&diams;' => '♦',
  '&divide;' => '÷',
  '&Eacute;' => 'É',
  '&eacute;' => 'é',
  '&Ecirc;' => 'Ê',
  '&ecirc;' => 'ê',
  '&Egrave;' => 'È',
  '&egrave;' => 'è',
  '&empty;' => '∅',
  // U+2003 EM SPACE.
  '&emsp;' => "\xE2\x80\x83",
  // U+2002 EN SPACE.
  '&ensp;' => "\xE2\x80\x82",
  '&Epsilon;' => 'Ε',
  '&epsilon;' => 'ε',
  '&equiv;' => '≡',
  '&Eta;' => 'Η',
  '&eta;' => 'η',
  '&ETH;' => 'Ð',
  '&eth;' => 'ð',
  '&Euml;' => 'Ë',
  '&euml;' => 'ë',
  '&euro;' => '€',
  '&exist;' => '∃',
  '&fnof;' => 'ƒ',
  '&forall;' => '∀',
  '&frac12;' => '½',
  '&frac14;' => '¼',
  '&frac34;' => '¾',
  '&frasl;' => '⁄',
  '&Gamma;' => 'Γ',
  '&gamma;' => 'γ',
  '&ge;' => '≥',
  '&harr;' => '↔',
  '&hArr;' => '⇔',
  '&hearts;' => '♥',
  '&hellip;' => '…',
  '&Iacute;' => 'Í',
  '&iacute;' => 'í',
  '&Icirc;' => 'Î',
  '&icirc;' => 'î',
  '&iexcl;' => '¡',
  '&Igrave;' => 'Ì',
  '&igrave;' => 'ì',
  '&image;' => 'ℑ',
  '&infin;' => '∞',
  '&int;' => '∫',
  '&Iota;' => 'Ι',
  '&iota;' => 'ι',
  '&iquest;' => '¿',
  '&isin;' => '∈',
  '&Iuml;' => 'Ï',
  '&iuml;' => 'ï',
  '&Kappa;' => 'Κ',
  '&kappa;' => 'κ',
  '&Lambda;' => 'Λ',
  '&lambda;' => 'λ',
  // U+2329 LEFT-POINTING ANGLE BRACKET, the code point HTML 4.01 assigns to
  // lang. Escaped because Unicode normalization rewrites it to U+3008.
  '&lang;' => "\xE2\x8C\xA9",
  '&laquo;' => '«',
  '&larr;' => '←',
  '&lArr;' => '⇐',
  '&lceil;' => '⌈',
  '&ldquo;' => '“',
  '&le;' => '≤',
  '&lfloor;' => '⌊',
  '&lowast;' => '∗',
  '&loz;' => '◊',
  // U+200E LEFT-TO-RIGHT MARK.
  '&lrm;' => "\xE2\x80\x8E",
  '&lsaquo;' => '‹',
  '&lsquo;' => '‘',
  '&macr;' => '¯',
  '&mdash;' => '—',
  // U+00B5 MICRO SIGN (not U+03BC GREEK SMALL LETTER MU, see '&mu;').
  '&micro;' => "\xC2\xB5",
  '&middot;' => '·',
  '&minus;' => '−',
  '&Mu;' => 'Μ',
  '&mu;' => 'μ',
  '&nabla;' => '∇',
  // U+00A0 NO-BREAK SPACE.
  '&nbsp;' => "\xC2\xA0",
  '&ndash;' => '–',
  '&ne;' => '≠',
  '&ni;' => '∋',
  '&not;' => '¬',
  '&notin;' => '∉',
  '&nsub;' => '⊄',
  '&Ntilde;' => 'Ñ',
  '&ntilde;' => 'ñ',
  '&Nu;' => 'Ν',
  '&nu;' => 'ν',
  '&Oacute;' => 'Ó',
  '&oacute;' => 'ó',
  '&Ocirc;' => 'Ô',
  '&ocirc;' => 'ô',
  '&OElig;' => 'Œ',
  '&oelig;' => 'œ',
  '&Ograve;' => 'Ò',
  '&ograve;' => 'ò',
  '&oline;' => '‾',
  '&Omega;' => 'Ω',
  '&omega;' => 'ω',
  '&Omicron;' => 'Ο',
  '&omicron;' => 'ο',
  '&oplus;' => '⊕',
  '&or;' => '∨',
  '&ordf;' => 'ª',
  '&ordm;' => 'º',
  '&Oslash;' => 'Ø',
  '&oslash;' => 'ø',
  '&Otilde;' => 'Õ',
  '&otilde;' => 'õ',
  '&otimes;' => '⊗',
  '&Ouml;' => 'Ö',
  '&ouml;' => 'ö',
  '&para;' => '¶',
  '&part;' => '∂',
  '&permil;' => '‰',
  '&perp;' => '⊥',
  '&Phi;' => 'Φ',
  '&phi;' => 'φ',
  '&Pi;' => 'Π',
  '&pi;' => 'π',
  '&piv;' => 'ϖ',
  '&plusmn;' => '±',
  '&pound;' => '£',
  '&prime;' => '′',
  '&Prime;' => '″',
  '&prod;' => '∏',
  '&prop;' => '∝',
  '&Psi;' => 'Ψ',
  '&psi;' => 'ψ',
  '&radic;' => '√',
  // U+232A RIGHT-POINTING ANGLE BRACKET, the code point HTML 4.01 assigns to
  // rang. Escaped because Unicode normalization rewrites it to U+3009.
  '&rang;' => "\xE2\x8C\xAA",
  '&raquo;' => '»',
  '&rarr;' => '→',
  '&rArr;' => '⇒',
  '&rceil;' => '⌉',
  '&rdquo;' => '”',
  '&real;' => 'ℜ',
  '&reg;' => '®',
  '&rfloor;' => '⌋',
  '&Rho;' => 'Ρ',
  '&rho;' => 'ρ',
  // U+200F RIGHT-TO-LEFT MARK.
  '&rlm;' => "\xE2\x80\x8F",
  '&rsaquo;' => '›',
  '&rsquo;' => '’',
  '&sbquo;' => '‚',
  '&Scaron;' => 'Š',
  '&scaron;' => 'š',
  '&sdot;' => '⋅',
  '&sect;' => '§',
  // U+00AD SOFT HYPHEN.
  '&shy;' => "\xC2\xAD",
  '&Sigma;' => 'Σ',
  '&sigma;' => 'σ',
  '&sigmaf;' => 'ς',
  '&sim;' => '∼',
  '&spades;' => '♠',
  '&sub;' => '⊂',
  '&sube;' => '⊆',
  '&sum;' => '∑',
  '&sup1;' => '¹',
  '&sup2;' => '²',
  '&sup3;' => '³',
  '&sup;' => '⊃',
  '&supe;' => '⊇',
  '&szlig;' => 'ß',
  '&Tau;' => 'Τ',
  '&tau;' => 'τ',
  '&there4;' => '∴',
  '&Theta;' => 'Θ',
  '&theta;' => 'θ',
  '&thetasym;' => 'ϑ',
  // U+2009 THIN SPACE.
  '&thinsp;' => "\xE2\x80\x89",
  '&THORN;' => 'Þ',
  '&thorn;' => 'þ',
  '&tilde;' => '˜',
  '&times;' => '×',
  '&trade;' => '™',
  '&Uacute;' => 'Ú',
  '&uacute;' => 'ú',
  '&uarr;' => '↑',
  '&uArr;' => '⇑',
  '&Ucirc;' => 'Û',
  '&ucirc;' => 'û',
  '&Ugrave;' => 'Ù',
  '&ugrave;' => 'ù',
  '&uml;' => '¨',
  '&upsih;' => 'ϒ',
  '&Upsilon;' => 'Υ',
  '&upsilon;' => 'υ',
  '&Uuml;' => 'Ü',
  '&uuml;' => 'ü',
  '&weierp;' => '℘',
  '&Xi;' => 'Ξ',
  '&xi;' => 'ξ',
  '&Yacute;' => 'Ý',
  '&yacute;' => 'ý',
  '&yen;' => '¥',
  '&yuml;' => 'ÿ',
  '&Yuml;' => 'Ÿ',
  '&Zeta;' => 'Ζ',
  '&zeta;' => 'ζ',
  // U+200D ZERO WIDTH JOINER.
  '&zwj;' => "\xE2\x80\x8D",
  // U+200C ZERO WIDTH NON-JOINER.
  '&zwnj;' => "\xE2\x80\x8C",
  '&gt;' => '>',
  '&lt;' => '<',
  '&quot;' => '"',
  // Add apostrophe (XML).
  '&apos;' => "'",
);
......@@ -323,33 +323,30 @@ function _mime_header_decode($matches) {
* array('<', '&', '"'). This affects both named and numerical entities.
*/
function decode_entities($text, $exclude = array()) {
  // Named-entity lookup table ('&name;' => UTF-8 string). The include runs in
  // this function's variable scope, so the $html_entities array assigned by
  // unicode.entities.inc lands in this static and is loaded only once.
  static $html_entities;
  if (!isset($html_entities)) {
    include DRUPAL_ROOT . '/includes/unicode.entities.inc';
  }
  // Flip the exclude list so that we can do quick lookups later.
  $exclude = array_flip($exclude);
  // Split the text around every entity in a single pass, so that the output
  // of one replacement is never scanned again and double-escaped entities
  // (e.g. '&amp;lt;') are only decoded once. This replaces the PCRE 'e'
  // (PREG_REPLACE_EVAL) modifier, which evaluated the replacement as PHP code
  // and is no longer available in PHP 7 and later.
  //
  // Every capture group always participates in a match ('#x?|' matches the
  // empty string for named entities), so each entity contributes exactly
  // three delimiter pieces: the whole entity, its prefix ('', '#' or '#x')
  // and its name or code point. The pieces are therefore laid out as:
  //   text, entity, prefix, name, text, entity, prefix, name, ..., text.
  $pieces = preg_split('/(&(#x?|)([A-Za-z0-9]+);)/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  if ($pieces === FALSE) {
    // Mirror preg_replace(), which returned NULL when PCRE failed.
    return NULL;
  }
  $decoded = $pieces[0];
  for ($i = 1, $count = count($pieces); $i < $count; $i += 4) {
    $decoded .= _decode_entities($pieces[$i + 1], $pieces[$i + 2], $pieces[$i], $html_entities, $exclude);
    // Literal text that follows the entity (possibly an empty string).
    $decoded .= $pieces[$i + 3];
  }
  return $decoded;
}
/**
* Helper function for decode_entities
*/
function _decode_entities($prefix, $codepoint, $original, &$table, &$exclude) {
function _decode_entities($prefix, $codepoint, $original, &$html_entities, &$exclude) {
// Named entity
if (!$prefix) {
if (isset($table[$original])) {
return $table[$original];
// A named entity not in the exclude list.
if (isset($html_entities[$original]) && !isset($exclude[$html_entities[$original]])) {
return $html_entities[$original];
}
else {
return $original;
......@@ -383,7 +380,7 @@ function _decode_entities($prefix, $codepoint, $original, &$table, &$exclude) {
. chr(0x80 | ( $codepoint & 0x3F));
}
// Check for excluded characters
if (in_array($str, $exclude)) {
if (isset($exclude[$str])) {
return $original;
}
else {
......
......@@ -162,6 +162,7 @@ class UnicodeUnitTest extends DrupalWebTestCase {
'Drupal' => 'Drupal',
'<script>' => '<script>',
'&lt;script&gt;' => '<script>',
'&#60;script&#62;' => '<script>',
'&amp;lt;script&amp;gt;' => '&lt;script&gt;',
'"' => '"',
'&#34;' => '"',
......@@ -178,6 +179,7 @@ class UnicodeUnitTest extends DrupalWebTestCase {
'&#8594;' => '→',
'➼' => '➼',
'&#10172;' => '➼',
'&euro;' => '€',
);
foreach ($testcase as $input => $output) {
$this->assertEqual(decode_entities($input), $output, t('Make sure the decoded entity of @input is @output', array('@input' => $input, '@output' => $output)));
......@@ -189,6 +191,7 @@ class UnicodeUnitTest extends DrupalWebTestCase {
'Drupal' => 'Drupal',
'<script>' => '<script>',
'&lt;script&gt;' => '&lt;script>',
'&#60;script&#62;' => '&#60;script>',
'&amp;lt;script&amp;gt;' => '&amp;lt;script&amp;gt;',
'"' => '"',
'&#34;' => '&#34;',
......@@ -205,6 +208,7 @@ class UnicodeUnitTest extends DrupalWebTestCase {
'&#8594;' => '→',
'➼' => '➼',
'&#10172;' => '➼',
'&euro;' => '€',
);
$exclude = array('<', '&', '"');
foreach ($testcase as $input => $output) {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment