bitbake: bitbake/bs4: Upgrade 4.3.2 -> 4.4.1 (python 3 version)

Upgrade to 4.4.1 which has been run through 2to3 as per the maintainers recommendation for v3 use. (Bitbake rev: 2f4b98af93c971a8c466ffaf3c09cca0edb6e3ad) Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
2026-06-02 13:29:49 +00:00 · 2016-05-06 09:06:51 +01:00
parent 4f8959324d
commit 822eabf32d
15 changed files with 972 additions and 361 deletions
@@ -80,9 +80,12 @@ builder_registry = TreeBuilderRegistry()
 class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

+    NAME = "[Unknown tree builder]"
+    ALTERNATE_NAMES = []
    features = []

    is_xml = False
+    picklable = False
    preserve_whitespace_tags = set()
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.
@@ -153,13 +156,13 @@ class TreeBuilder(object):
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
-            for attr in attrs.keys():
+            for attr in list(attrs.keys()):
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    value = attrs[attr]
-                    if isinstance(value, basestring):
+                    if isinstance(value, str):
                        values = whitespace_re.split(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
@@ -2,6 +2,7 @@ __all__ = [
    'HTML5TreeBuilder',
    ]

+from pdb import set_trace
 import warnings
 from bs4.builder import (
    PERMISSIVE,
@@ -9,7 +10,10 @@ from bs4.builder import (
    HTML_5,
    HTMLTreeBuilder,
    )
-from bs4.element import NamespacedAttribute
+from bs4.element import (
+    NamespacedAttribute,
+    whitespace_re,
+)
 import html5lib
 from html5lib.constants import namespaces
 from bs4.element import (
@@ -22,11 +26,20 @@ from bs4.element import (
 class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

-    features = ['html5lib', PERMISSIVE, HTML_5, HTML]
+    NAME = "html5lib"

-    def prepare_markup(self, markup, user_specified_encoding):
+    features = [NAME, PERMISSIVE, HTML_5, HTML]
+
+    def prepare_markup(self, markup, user_specified_encoding,
+                       document_declared_encoding=None, exclude_encodings=None):
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
+
+        # document_declared_encoding and exclude_encodings aren't used
+        # ATM because the html5lib TreeBuilder doesn't use
+        # UnicodeDammit.
+        if exclude_encodings:
+            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
@@ -37,7 +50,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
@@ -51,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
-        return u'<html><head></head><body>%s</body></html>' % fragment
+        return '<html><head></head><body>%s</body></html>' % fragment


 class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
@@ -101,7 +114,16 @@ class AttrList(object):
    def __iter__(self):
        return list(self.attrs.items()).__iter__()
    def __setitem__(self, name, value):
-        "set attr", name, value
+        # If this attribute is a multi-valued attribute for this element,
+        # turn its value into a list.
+        list_attr = HTML5TreeBuilder.cdata_list_attributes
+        if (name in list_attr['*']
+            or (self.element.name in list_attr
+                and name in list_attr[self.element.name])):
+            # A node that is being cloned may have already undergone
+            # this procedure.
+            if not isinstance(value, list):
+                value = whitespace_re.split(value)
        self.element[name] = value
    def items(self):
        return list(self.attrs.items())
@@ -124,7 +146,7 @@ class Element(html5lib.treebuilders._base.Node):

    def appendChild(self, node):
        string_child = child = None
-        if isinstance(node, basestring):
+        if isinstance(node, str):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
@@ -139,7 +161,7 @@ class Element(html5lib.treebuilders._base.Node):
        else:
            child = node.element

-        if not isinstance(child, basestring) and child.parent is not None:
+        if not isinstance(child, str) and child.parent is not None:
            node.element.extract()

        if (string_child and self.element.contents
@@ -152,7 +174,7 @@ class Element(html5lib.treebuilders._base.Node):
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
-            if isinstance(node, basestring):
+            if isinstance(node, str):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

@@ -161,6 +183,12 @@ class Element(html5lib.treebuilders._base.Node):
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
+            elif self.element.next_element is not None:
+                # Something from further ahead in the parse tree is
+                # being inserted into this earlier element. This is
+                # very annoying because it means an expensive search
+                # for the last element in the tree.
+                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

@@ -172,6 +200,7 @@ class Element(html5lib.treebuilders._base.Node):
        return AttrList(self.element)

    def setAttributes(self, attributes):
+
        if attributes is not None and len(attributes) > 0:

            converted_attributes = []
@@ -183,7 +212,7 @@ class Element(html5lib.treebuilders._base.Node):

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
-            for name, value in attributes.items():
+            for name, value in list(attributes.items()):
                self.element[name] = value

            # The attributes may contain variables that need substitution.
@@ -218,6 +247,9 @@ class Element(html5lib.treebuilders._base.Node):

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
+        # print "MOVE", self.element.contents
+        # print "FROM", self.element
+        # print "TO", new_parent.element
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
@@ -236,17 +268,28 @@ class Element(html5lib.treebuilders._base.Node):
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
-        append_after = new_parent.element.contents
+        append_after = new_parent_element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
-            first_child.previous_element = new_parents_last_descendant
+            if new_parents_last_descendant:
+                first_child.previous_element = new_parents_last_descendant
+            else:
+                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
+            if new_parents_last_descendant:
+                new_parents_last_descendant.next_element = first_child
+            else:
+                new_parent_element.next_element = first_child
+            if new_parents_last_child:
+                new_parents_last_child.next_sibling = first_child

            # Fix the last child's next_element and next_sibling
            last_child = to_append[-1]
            last_child.next_element = new_parents_last_descendant_next_element
+            if new_parents_last_descendant_next_element:
+                new_parents_last_descendant_next_element.previous_element = last_child
            last_child.next_sibling = None

        for child in to_append:
@@ -257,6 +300,10 @@ class Element(html5lib.treebuilders._base.Node):
        element.contents = []
        element.next_element = final_next_element

+        # print "DONE WITH MOVE"
+        # print "FROM", self.element
+        # print "TO", new_parent_element
+
    def cloneNode(self):
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
@@ -4,10 +4,16 @@ __all__ = [
    'HTMLParserTreeBuilder',
    ]

-from HTMLParser import (
-    HTMLParser,
-    HTMLParseError,
-    )
+from html.parser import HTMLParser
+
+try:
+    from html.parser import HTMLParseError
+except ImportError as e:
+    # HTMLParseError is removed in Python 3.5. Since it can never be
+    # thrown in 3.5, we can just define our own class as a placeholder.
+    class HTMLParseError(Exception):
+        pass
+
 import sys
 import warnings

@@ -19,10 +25,10 @@ import warnings
 # At the end of this file, we monkeypatch HTMLParser so that
 # strict=True works well on Python 3.2.2.
 major, minor, release = sys.version_info[:3]
-CONSTRUCTOR_TAKES_STRICT = (
-    major > 3
-    or (major == 3 and minor > 2)
-    or (major == 3 and minor == 2 and release >= 3))
+CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
+CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
+CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
+

 from bs4.element import (
    CData,
@@ -63,7 +69,8 @@ class BeautifulSoupHTMLParser(HTMLParser):

    def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
-        # it's fixed.
+        # it's fixed in all supported versions.
+        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
@@ -72,9 +79,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
            real_name = int(name)

        try:
-            data = unichr(real_name)
-        except (ValueError, OverflowError), e:
-            data = u"\N{REPLACEMENT CHARACTER}"
+            data = chr(real_name)
+        except (ValueError, OverflowError) as e:
+            data = "\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

@@ -113,14 +120,6 @@ class BeautifulSoupHTMLParser(HTMLParser):

    def handle_pi(self, data):
        self.soup.endData()
-        if data.endswith("?") and data.lower().startswith("xml"):
-            # "An XHTML processing instruction using the trailing '?'
-            # will cause the '?' to be included in data." - HTMLParser
-            # docs.
-            #
-            # Strip the question mark so we don't end up with two
-            # question marks.
-            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)

@@ -128,26 +127,31 @@ class BeautifulSoupHTMLParser(HTMLParser):
 class HTMLParserTreeBuilder(HTMLTreeBuilder):

    is_xml = False
-    features = [HTML, STRICT, HTMLPARSER]
+    picklable = True
+    NAME = HTMLPARSER
+    features = [NAME, HTML, STRICT]

    def __init__(self, *args, **kwargs):
-        if CONSTRUCTOR_TAKES_STRICT:
+        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            kwargs['strict'] = False
+        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
+            kwargs['convert_charrefs'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
-                       document_declared_encoding=None):
+                       document_declared_encoding=None, exclude_encodings=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
            yield (markup, None, None, False)
            return

        try_encodings = [user_specified_encoding, document_declared_encoding]
-        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
+        dammit = UnicodeDammit(markup, try_encodings, is_html=True,
+                               exclude_encodings=exclude_encodings)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)
@@ -158,7 +162,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
        parser.soup = self.soup
        try:
            parser.feed(markup)
-        except HTMLParseError, e:
+        except HTMLParseError as e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e
@@ -4,10 +4,15 @@ __all__ = [
    ]

 from io import BytesIO
-from StringIO import StringIO
+from io import StringIO
 import collections
 from lxml import etree
-from bs4.element import Comment, Doctype, NamespacedAttribute
+from bs4.element import (
+    Comment,
+    Doctype,
+    NamespacedAttribute,
+    ProcessingInstruction,
+)
 from bs4.builder import (
    FAST,
    HTML,
@@ -25,8 +30,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):

    is_xml = True

+    NAME = "lxml-xml"
+    ALTERNATE_NAMES = ["xml"]
+
    # Well, it's permissive by XML parser standards.
-    features = [LXML, XML, FAST, PERMISSIVE]
+    features = [NAME, LXML, XML, FAST, PERMISSIVE]

    CHUNK_SIZE = 512

@@ -70,6 +78,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
+                       exclude_encodings=None,
                       document_declared_encoding=None):
        """
        :yield: A series of 4-tuples.
@@ -78,12 +87,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):

        Each 4-tuple represents a strategy for parsing the document.
        """
-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
            yield markup, None, document_declared_encoding, False

-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
@@ -95,14 +104,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):
        # the document as each one in turn.
        is_html = not self.is_xml
        try_encodings = [user_specified_encoding, document_declared_encoding]
-        detector = EncodingDetector(markup, try_encodings, is_html)
+        detector = EncodingDetector(
+            markup, try_encodings, is_html, exclude_encodings)
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup):
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
-        elif isinstance(markup, unicode):
+        elif isinstance(markup, str):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
@@ -117,7 +127,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
-        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(str(e))

    def close(self):
@@ -135,12 +145,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.
-            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
+            inverted_nsmap = dict((value, key) for key, value in list(nsmap.items()))
            self.nsmaps.append(inverted_nsmap)
            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            attrs = attrs.copy()
-            for prefix, namespace in nsmap.items():
+            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace
@@ -149,7 +159,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
        # from lxml with namespaces attached to their names, and
        # turn then into NamespacedAttribute objects.
        new_attrs = {}
-        for attr, value in attrs.items():
+        for attr, value in list(attrs.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                new_attrs[attr] = value
@@ -189,7 +199,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            self.nsmaps.pop()

    def pi(self, target, data):
-        pass
+        self.soup.endData()
+        self.soup.handle_data(target + ' ' + data)
+        self.soup.endData(ProcessingInstruction)

    def data(self, content):
        self.soup.handle_data(content)
@@ -207,12 +219,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
-        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment


 class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):

-    features = [LXML, HTML, FAST, PERMISSIVE]
+    NAME = LXML
+    ALTERNATE_NAMES = ["lxml-html"]
+
+    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False

    def default_parser(self, encoding):
@@ -224,10 +239,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
            self.parser = self.parser_for(encoding)
            self.parser.feed(markup)
            self.parser.close()
-        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(str(e))


    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
-        return u'<html><body>%s</body></html>' % fragment
+        return '<html><body>%s</body></html>' % fragment