From 667bf95308cfbaac5929bee70e9ba98fa92ed3db Mon Sep 17 00:00:00 2001
From: Dries Buytaert <dries@buytaert.net>
Date: Sun, 20 May 2007 16:44:35 +0000
Subject: [PATCH] - Patch #54833 by Steven: added an HTML corrector.

---
 CHANGELOG.txt                 |  3 ++
 modules/filter/filter.module  | 80 ++++++++++++++++++++++++++++++++++-
 modules/system/system.install | 28 ++++++++++++
 3 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index dd927d66e557..960321085239 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -40,6 +40,9 @@ Drupal 6.0, xxxx-xx-xx (development version)
     * Added form to provide initial site information during installation.
     * Added ability to provide extra installation steps programmatically.
     * Made it possible to import interface translations at install time.
+- Added the HTML corrector filter:
+    * Fixes faulty and chopped off HTML in postings.
+    * Tags are now automatically closed at the end of the teaser.
 
 Drupal 5.0, 2007-01-15
 ----------------------
diff --git a/modules/filter/filter.module b/modules/filter/filter.module
index d891ab2a6674..bfc7d1c494b0 100644
--- a/modules/filter/filter.module
+++ b/modules/filter/filter.module
@@ -955,7 +955,7 @@ function theme_filter_tips_more_info() {
 function filter_filter($op, $delta = 0, $format = -1, $text = '') {
   switch ($op) {
     case 'list':
-      return array(0 => t('HTML filter'), 1 => t('Line break converter'), 2 => t('URL filter'));
+      return array(0 => t('HTML filter'), 1 => t('Line break converter'), 2 => t('URL filter'), 3 => t('HTML corrector'));
 
     case 'description':
       switch ($delta) {
@@ -965,6 +965,8 @@ function filter_filter($op, $delta = 0, $format = -1, $text = '') {
           return t('Converts line breaks into HTML (i.e. &lt;br&gt; and &lt;p&gt; tags).');
         case 2:
           return t('Turns web and e-mail addresses into clickable links.');
+        case 3:
+          return t('Corrects faulty and chopped off HTML in postings.');
         default:
           return;
       }
@@ -977,6 +979,8 @@ function filter_filter($op, $delta = 0, $format = -1, $text = '') {
           return _filter_autop($text);
         case 2:
           return _filter_url($text, $format);
+        case 3:
+          return _filter_htmlcorrector($text);
         default:
           return $text;
       }
@@ -1098,6 +1102,80 @@ function _filter_url($text, $format) {
   return $text;
 }
 
+/**
+ * Scan input and make sure that all HTML tags are properly closed and nested.
+ *
+ * @param $text
+ *   The HTML fragment to correct.
+ * @return
+ *   The fragment with stray closing tags dropped, open tags closed in nesting
+ *   order and single-use tags written in self-closed form.
+ */
+function _filter_htmlcorrector($text) {
+  // Prepare tag lists.
+  static $no_nesting, $single_use;
+  if (!isset($no_nesting)) {
+    // Tags which cannot be nested but are typically left unclosed.
+    $no_nesting = drupal_map_assoc(array('li', 'p'));
+
+    // Single use tags in HTML4
+    $single_use = drupal_map_assoc(array('base', 'meta', 'link', 'hr', 'br', 'param', 'img', 'area', 'input', 'col', 'frame'));
+  }
+
+  // Properly entify angles.
+  $text = preg_replace('!<([^a-zA-Z/])!', '&lt;\1', $text);
+
+  // Split tags from text. Because the delimiter is captured, the result
+  // alternates between text and tag contents and begins and ends with text.
+  $split = preg_split('/<([^>]+?)>/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
+
+  $stack = array();
+  $output = '';
+  foreach ($split as $index => $value) {
+    // Even entries are literal text: pass them through untouched.
+    if ($index % 2 == 0) {
+      $output .= $value;
+      continue;
+    }
+
+    // Odd entries are tag contents. Take the name up to the first whitespace
+    // or slash so '<br/>' and tags with attributes on a new line are matched.
+    preg_match('!^/?([^\s/]*)!', strtolower($value), $match);
+    $tagname = $match[1];
+    // Closing tag
+    if ($value[0] == '/') {
+      // Discard closing tags for single use tags and for tags never opened.
+      if (!isset($single_use[$tagname]) && in_array($tagname, $stack, true)) {
+        // Close other tags lingering first.
+        do {
+          $output .= '</'. $stack[0] .'>';
+        } while (array_shift($stack) !== $tagname);
+      }
+    }
+    // Opening tag
+    else {
+      // See if we have an identical 'no nesting' tag already open and close it if found.
+      if (count($stack) && ($stack[0] == $tagname) && isset($no_nesting[$stack[0]])) {
+        $output .= '</'. array_shift($stack) .'>';
+      }
+      // Push non-single-use tags onto the stack
+      if (!isset($single_use[$tagname])) {
+        array_unshift($stack, $tagname);
+      }
+      // Add trailing slash to single-use tags as per X(HT)ML.
+      else {
+        $value = rtrim($value, ' /') . ' /';
+      }
+      $output .= '<'. $value .'>';
+    }
+  }
+  // Close remaining tags.
+  while (count($stack) > 0) {
+    $output .= '</'. array_shift($stack) .'>';
+  }
+  return $output;
+}
+
 /**
  * Make links out of absolute URLs.
  */
diff --git a/modules/system/system.install b/modules/system/system.install
index 7bc70779f12d..30d4a62d3018 100644
--- a/modules/system/system.install
+++ b/modules/system/system.install
@@ -1209,12 +1209,16 @@ function system_install() {
   db_query("INSERT INTO {filters} (format, module, delta, weight) VALUES (1, 'filter', 0, 1)");
   // Line break filter.
   db_query("INSERT INTO {filters} (format, module, delta, weight) VALUES (1, 'filter', 1, 2)");
+  // HTML corrector filter.
+  db_query("INSERT INTO {filters} (format, module, delta, weight) VALUES (1, 'filter', 3, 10)");
 
   // Full HTML:
   // URL filter.
   db_query("INSERT INTO {filters} (format, module, delta, weight) VALUES (2, 'filter', 2, 0)");
   // Line break filter.
   db_query("INSERT INTO {filters} (format, module, delta, weight) VALUES (2, 'filter', 1, 1)");
+  // HTML corrector filter (format 2 is Full HTML, matching the inserts above).
+  db_query("INSERT INTO {filters} (format, module, delta, weight) VALUES (2, 'filter', 3, 10)");
 
   db_query("INSERT INTO {variable} (name,value) VALUES ('filter_html_1','i:1;')");
 
@@ -4062,6 +4066,30 @@ function system_update_6017() {
   return $ret;
 }
 
+/**
+ * Add HTML corrector to HTML formats or replace the old module if it was in use.
+ */
+function system_update_6018() {
+  $ret = array();
+
+  // Disable htmlcorrector.module, if it exists and hand its {filters} rows to core.
+  if (module_exists('htmlcorrector')) {
+    module_disable(array('htmlcorrector'));
+    $ret[] = update_sql("UPDATE {filters} SET module = 'filter', delta = 3 WHERE module = 'htmlcorrector'");
+    $ret[] = array('success' => TRUE, 'query' => t('HTML Corrector module was disabled; this functionality has now been added to core.'));
+    return $ret;
+  }
+
+  // Otherwise, add the filter to any format with 'HTML' in its name, at weight max(10, heaviest + 1).
+  $result = db_query("SELECT format FROM {filter_formats} WHERE name LIKE '%HTML%'");
+  while ($format = db_fetch_object($result)) {
+    $weight = db_result(db_query("SELECT MAX(weight) FROM {filters} WHERE format = %d", $format->format));
+    db_query("INSERT INTO {filters} (format, module, delta, weight) VALUES (%d, '%s', %d, %d)", $format->format, 'filter', 3, max(10, $weight + 1));
+  }
+
+  return $ret;
+}
+
 /**
  * @} End of "defgroup updates-5.x-to-6.x"
  * The next series of updates should start at 7000.
-- 
GitLab