revert: fix(ngSanitize): follow HTML parser rules for start tags / allow < in text content

This reverts commit 36d2658b94. The reverted commit broke the ci-checks task when it was ported to v1.2.x; I will sort this out shortly.
2014-07-16 18:13:17 -04:00
parent af5aacce05
commit 25d3d3730d
2 changed files with 15 additions and 56 deletions
@@ -154,11 +154,11 @@ function sanitizeText(chars) {

 // Regular Expressions for parsing tags and attributes
 var START_TAG_REGEXP =
-       /^<((?:[a-zA-Z])[\w:-]*)((?:\s+[\w:-]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)\s*(>?)/,
-  END_TAG_REGEXP = /^<\/\s*([\w:-]+)[^>]*>/,
+       /^<\s*([\w:-]+)((?:\s+[\w:-]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)\s*>/,
+  END_TAG_REGEXP = /^<\s*\/\s*([\w:-]+)[^>]*>/,
  ATTR_REGEXP = /([\w:-]+)(?:\s*=\s*(?:(?:"((?:[^"])*)")|(?:'((?:[^'])*)')|([^>\s]+)))?/g,
  BEGIN_TAG_REGEXP = /^</,
-  BEGING_END_TAGE_REGEXP = /^<\//,
+  BEGING_END_TAGE_REGEXP = /^<\s*\//,
  COMMENT_REGEXP = /<!--(.*?)-->/g,
  DOCTYPE_REGEXP = /<!DOCTYPE([^>]*?)>/i,
  CDATA_REGEXP = /<!\[CDATA\[(.*?)]]>/g,
@@ -232,11 +232,10 @@ function makeMap(str) {
 * @param {object} handler
 */
 function htmlParser( html, handler ) {
-  var index, chars, match, stack = [], last = html, text;
+  var index, chars, match, stack = [], last = html;
  stack.last = function() { return stack[ stack.length - 1 ]; };

  while ( html ) {
-    text = '';
    chars = true;

    // Make sure we're not in a script or style element
@@ -275,23 +274,16 @@ function htmlParser( html, handler ) {
        match = html.match( START_TAG_REGEXP );

        if ( match ) {
-          // We only have a valid start-tag if there is a '>'.
-          if ( match[4] ) {
-            html = html.substring( match[0].length );
-            match[0].replace( START_TAG_REGEXP, parseStartTag );
-          }
+          html = html.substring( match[0].length );
+          match[0].replace( START_TAG_REGEXP, parseStartTag );
          chars = false;
-        } else {
-          // no ending tag found --- this piece should be encoded as an entity.
-          text += '<';
-          html = html.substring(1);
        }
      }

      if ( chars ) {
        index = html.indexOf("<");

-        text += index < 0 ? html : html.substring( 0, index );
+        var text = index < 0 ? html : html.substring( 0, index );
        html = index < 0 ? "" : html.substring( index );

        if (handler.chars) handler.chars( decodeEntities(text) );
@@ -21,7 +21,6 @@ describe('HTML', function() {

    var handler, start, text, comment;
    beforeEach(function() {
-      text = "";
      handler = {
        start: function(tag, attrs, unary){
          start = {
@@ -36,7 +35,7 @@ describe('HTML', function() {
          });
        },
        chars: function(text_){
-          text += text_;
+          text = text_;
        },
        end:function(tag) {
          expect(tag).toEqual(start.tag);
@@ -82,31 +81,8 @@ describe('HTML', function() {
      expect(text).toEqual('text');
    });

-    it('should not treat "<" followed by a non-/ or non-letter as a tag', function() {
-      expectHTML('<- text1 text2 <1 text1 text2 <{', handler).
-        toBe('&lt;- text1 text2 &lt;1 text1 text2 &lt;{');
-    });
-
-    it('should throw badparse if text content contains "<" followed by "/" without matching ">"', function() {
-      expect(function() {
-        htmlParser('foo </ bar', handler);
-      }).toThrowMinErr('$sanitize', 'badparse', 'The sanitizer was unable to parse the following block of html: </ bar');
-    });
-
-    it('should throw badparse if text content contains "<" followed by an ASCII letter without matching ">"', function() {
-      expect(function() {
-        htmlParser('foo <a bar', handler);
-      }).toThrowMinErr('$sanitize', 'badparse', 'The sanitizer was unable to parse the following block of html: <a bar');
-    });
-
-    it('should accept tag delimiters such as "<" inside real tags', function() {
-      // Assert that the < is part of the text node content, and not part of a tag name.
-      htmlParser('<p> 10 < 100 </p>', handler);
-      expect(text).toEqual(' 10 < 100 ');
-    });
-
    it('should parse newlines in tags', function() {
-      htmlParser('<tag\n attr="value"\n>text</\ntag\n>', handler);
+      htmlParser('<\ntag\n attr="value"\n>text<\n/\ntag\n>', handler);
      expect(start).toEqual({tag:'tag', attrs:{attr:'value'}, unary:false});
      expect(text).toEqual('text');
    });
@@ -147,9 +123,8 @@ describe('HTML', function() {
    expectHTML('a<!DocTyPe html>c.').toEqual('ac.');
  });

-  it('should escape non-start tags', function() {
-    expectHTML('a< SCRIPT >A< SCRIPT >evil< / scrIpt >B< / scrIpt >c.').
-      toBe('a&lt; SCRIPT &gt;A&lt; SCRIPT &gt;evil&lt; / scrIpt &gt;B&lt; / scrIpt &gt;c.');
+  it('should remove nested script', function() {
+    expectHTML('a< SCRIPT >A< SCRIPT >evil< / scrIpt >B< / scrIpt >c.').toEqual('ac.');
  });

  it('should remove attrs', function() {
@@ -190,16 +165,14 @@ describe('HTML', function() {
    expectHTML(everything).toEqual(everything);
  });

-  it('should mangle improper html', function() {
-    // This text is encoded more than a real HTML parser would, but it should render the same.
+  it('should handle improper html', function() {
    expectHTML('< div rel="</div>" alt=abc dir=\'"\' >text< /div>').
-      toBe('&lt; div rel=&#34;&#34; alt=abc dir=\'&#34;\' &gt;text&lt; /div&gt;');
+      toEqual('<div rel="&lt;/div&gt;" alt="abc" dir="&#34;">text</div>');
  });

-  it('should mangle improper html2', function() {
-    // A proper HTML parser would clobber this more in most cases, but it looks reasonable.
+  it('should handle improper html2', function() {
    expectHTML('< div rel="</div>" / >').
-      toBe('&lt; div rel=&#34;&#34; / &gt;');
+      toEqual('<div rel="&lt;/div&gt;"/>');
  });

  it('should ignore back slash as escape', function() {
@@ -222,12 +195,6 @@ describe('HTML', function() {
    expectHTML('\na\n').toEqual('&#10;a&#10;');
  });

-  it('should accept tag delimiters such as "<" inside real tags (with nesting)', function() {
-    //this is an integrated version of the 'should accept tag delimiters such as "<" inside real tags' test
-    expectHTML('<p> 10 < <span>100</span> </p>')
-    .toEqual('<p> 10 &lt; <span>100</span> </p>');
-  });
-
  describe('htmlSanitizerWriter', function() {
    /* global htmlSanitizeWriter: false */
    if (angular.isUndefined(window.htmlSanitizeWriter)) return;