base.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417
  1. from __future__ import absolute_import, division, unicode_literals
  2. from pip._vendor.six import text_type
  3. from ..constants import scopingElements, tableInsertModeElements, namespaces
  4. # The scope markers are inserted when entering object elements,
  5. # marquees, table cells, and table captions, and are used to prevent formatting
  6. # from "leaking" into tables, object elements, and marquees.
  7. Marker = None
  8. listElementsMap = {
  9. None: (frozenset(scopingElements), False),
  10. "button": (frozenset(scopingElements | set([(namespaces["html"], "button")])), False),
  11. "list": (frozenset(scopingElements | set([(namespaces["html"], "ol"),
  12. (namespaces["html"], "ul")])), False),
  13. "table": (frozenset([(namespaces["html"], "html"),
  14. (namespaces["html"], "table")]), False),
  15. "select": (frozenset([(namespaces["html"], "optgroup"),
  16. (namespaces["html"], "option")]), True)
  17. }
  18. class Node(object):
  19. """Represents an item in the tree"""
  20. def __init__(self, name):
  21. """Creates a Node
  22. :arg name: The tag name associated with the node
  23. """
  24. # The tag name assocaited with the node
  25. self.name = name
  26. # The parent of the current node (or None for the document node)
  27. self.parent = None
  28. # The value of the current node (applies to text nodes and comments)
  29. self.value = None
  30. # A dict holding name -> value pairs for attributes of the node
  31. self.attributes = {}
  32. # A list of child nodes of the current node. This must include all
  33. # elements but not necessarily other node types.
  34. self.childNodes = []
  35. # A list of miscellaneous flags that can be set on the node.
  36. self._flags = []
  37. def __str__(self):
  38. attributesStr = " ".join(["%s=\"%s\"" % (name, value)
  39. for name, value in
  40. self.attributes.items()])
  41. if attributesStr:
  42. return "<%s %s>" % (self.name, attributesStr)
  43. else:
  44. return "<%s>" % (self.name)
  45. def __repr__(self):
  46. return "<%s>" % (self.name)
  47. def appendChild(self, node):
  48. """Insert node as a child of the current node
  49. :arg node: the node to insert
  50. """
  51. raise NotImplementedError
  52. def insertText(self, data, insertBefore=None):
  53. """Insert data as text in the current node, positioned before the
  54. start of node insertBefore or to the end of the node's text.
  55. :arg data: the data to insert
  56. :arg insertBefore: True if you want to insert the text before the node
  57. and False if you want to insert it after the node
  58. """
  59. raise NotImplementedError
  60. def insertBefore(self, node, refNode):
  61. """Insert node as a child of the current node, before refNode in the
  62. list of child nodes. Raises ValueError if refNode is not a child of
  63. the current node
  64. :arg node: the node to insert
  65. :arg refNode: the child node to insert the node before
  66. """
  67. raise NotImplementedError
  68. def removeChild(self, node):
  69. """Remove node from the children of the current node
  70. :arg node: the child node to remove
  71. """
  72. raise NotImplementedError
  73. def reparentChildren(self, newParent):
  74. """Move all the children of the current node to newParent.
  75. This is needed so that trees that don't store text as nodes move the
  76. text in the correct way
  77. :arg newParent: the node to move all this node's children to
  78. """
  79. # XXX - should this method be made more general?
  80. for child in self.childNodes:
  81. newParent.appendChild(child)
  82. self.childNodes = []
  83. def cloneNode(self):
  84. """Return a shallow copy of the current node i.e. a node with the same
  85. name and attributes but with no parent or child nodes
  86. """
  87. raise NotImplementedError
  88. def hasContent(self):
  89. """Return true if the node has children or text, false otherwise
  90. """
  91. raise NotImplementedError
  92. class ActiveFormattingElements(list):
  93. def append(self, node):
  94. equalCount = 0
  95. if node != Marker:
  96. for element in self[::-1]:
  97. if element == Marker:
  98. break
  99. if self.nodesEqual(element, node):
  100. equalCount += 1
  101. if equalCount == 3:
  102. self.remove(element)
  103. break
  104. list.append(self, node)
  105. def nodesEqual(self, node1, node2):
  106. if not node1.nameTuple == node2.nameTuple:
  107. return False
  108. if not node1.attributes == node2.attributes:
  109. return False
  110. return True
  111. class TreeBuilder(object):
  112. """Base treebuilder implementation
  113. * documentClass - the class to use for the bottommost node of a document
  114. * elementClass - the class to use for HTML Elements
  115. * commentClass - the class to use for comments
  116. * doctypeClass - the class to use for doctypes
  117. """
  118. # pylint:disable=not-callable
  119. # Document class
  120. documentClass = None
  121. # The class to use for creating a node
  122. elementClass = None
  123. # The class to use for creating comments
  124. commentClass = None
  125. # The class to use for creating doctypes
  126. doctypeClass = None
  127. # Fragment class
  128. fragmentClass = None
  129. def __init__(self, namespaceHTMLElements):
  130. """Create a TreeBuilder
  131. :arg namespaceHTMLElements: whether or not to namespace HTML elements
  132. """
  133. if namespaceHTMLElements:
  134. self.defaultNamespace = "http://www.w3.org/1999/xhtml"
  135. else:
  136. self.defaultNamespace = None
  137. self.reset()
  138. def reset(self):
  139. self.openElements = []
  140. self.activeFormattingElements = ActiveFormattingElements()
  141. # XXX - rename these to headElement, formElement
  142. self.headPointer = None
  143. self.formPointer = None
  144. self.insertFromTable = False
  145. self.document = self.documentClass()
  146. def elementInScope(self, target, variant=None):
  147. # If we pass a node in we match that. if we pass a string
  148. # match any node with that name
  149. exactNode = hasattr(target, "nameTuple")
  150. if not exactNode:
  151. if isinstance(target, text_type):
  152. target = (namespaces["html"], target)
  153. assert isinstance(target, tuple)
  154. listElements, invert = listElementsMap[variant]
  155. for node in reversed(self.openElements):
  156. if exactNode and node == target:
  157. return True
  158. elif not exactNode and node.nameTuple == target:
  159. return True
  160. elif (invert ^ (node.nameTuple in listElements)):
  161. return False
  162. assert False # We should never reach this point
  163. def reconstructActiveFormattingElements(self):
  164. # Within this algorithm the order of steps described in the
  165. # specification is not quite the same as the order of steps in the
  166. # code. It should still do the same though.
  167. # Step 1: stop the algorithm when there's nothing to do.
  168. if not self.activeFormattingElements:
  169. return
  170. # Step 2 and step 3: we start with the last element. So i is -1.
  171. i = len(self.activeFormattingElements) - 1
  172. entry = self.activeFormattingElements[i]
  173. if entry == Marker or entry in self.openElements:
  174. return
  175. # Step 6
  176. while entry != Marker and entry not in self.openElements:
  177. if i == 0:
  178. # This will be reset to 0 below
  179. i = -1
  180. break
  181. i -= 1
  182. # Step 5: let entry be one earlier in the list.
  183. entry = self.activeFormattingElements[i]
  184. while True:
  185. # Step 7
  186. i += 1
  187. # Step 8
  188. entry = self.activeFormattingElements[i]
  189. clone = entry.cloneNode() # Mainly to get a new copy of the attributes
  190. # Step 9
  191. element = self.insertElement({"type": "StartTag",
  192. "name": clone.name,
  193. "namespace": clone.namespace,
  194. "data": clone.attributes})
  195. # Step 10
  196. self.activeFormattingElements[i] = element
  197. # Step 11
  198. if element == self.activeFormattingElements[-1]:
  199. break
  200. def clearActiveFormattingElements(self):
  201. entry = self.activeFormattingElements.pop()
  202. while self.activeFormattingElements and entry != Marker:
  203. entry = self.activeFormattingElements.pop()
  204. def elementInActiveFormattingElements(self, name):
  205. """Check if an element exists between the end of the active
  206. formatting elements and the last marker. If it does, return it, else
  207. return false"""
  208. for item in self.activeFormattingElements[::-1]:
  209. # Check for Marker first because if it's a Marker it doesn't have a
  210. # name attribute.
  211. if item == Marker:
  212. break
  213. elif item.name == name:
  214. return item
  215. return False
  216. def insertRoot(self, token):
  217. element = self.createElement(token)
  218. self.openElements.append(element)
  219. self.document.appendChild(element)
  220. def insertDoctype(self, token):
  221. name = token["name"]
  222. publicId = token["publicId"]
  223. systemId = token["systemId"]
  224. doctype = self.doctypeClass(name, publicId, systemId)
  225. self.document.appendChild(doctype)
  226. def insertComment(self, token, parent=None):
  227. if parent is None:
  228. parent = self.openElements[-1]
  229. parent.appendChild(self.commentClass(token["data"]))
  230. def createElement(self, token):
  231. """Create an element but don't insert it anywhere"""
  232. name = token["name"]
  233. namespace = token.get("namespace", self.defaultNamespace)
  234. element = self.elementClass(name, namespace)
  235. element.attributes = token["data"]
  236. return element
  237. def _getInsertFromTable(self):
  238. return self._insertFromTable
  239. def _setInsertFromTable(self, value):
  240. """Switch the function used to insert an element from the
  241. normal one to the misnested table one and back again"""
  242. self._insertFromTable = value
  243. if value:
  244. self.insertElement = self.insertElementTable
  245. else:
  246. self.insertElement = self.insertElementNormal
  247. insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
  248. def insertElementNormal(self, token):
  249. name = token["name"]
  250. assert isinstance(name, text_type), "Element %s not unicode" % name
  251. namespace = token.get("namespace", self.defaultNamespace)
  252. element = self.elementClass(name, namespace)
  253. element.attributes = token["data"]
  254. self.openElements[-1].appendChild(element)
  255. self.openElements.append(element)
  256. return element
  257. def insertElementTable(self, token):
  258. """Create an element and insert it into the tree"""
  259. element = self.createElement(token)
  260. if self.openElements[-1].name not in tableInsertModeElements:
  261. return self.insertElementNormal(token)
  262. else:
  263. # We should be in the InTable mode. This means we want to do
  264. # special magic element rearranging
  265. parent, insertBefore = self.getTableMisnestedNodePosition()
  266. if insertBefore is None:
  267. parent.appendChild(element)
  268. else:
  269. parent.insertBefore(element, insertBefore)
  270. self.openElements.append(element)
  271. return element
  272. def insertText(self, data, parent=None):
  273. """Insert text data."""
  274. if parent is None:
  275. parent = self.openElements[-1]
  276. if (not self.insertFromTable or (self.insertFromTable and
  277. self.openElements[-1].name
  278. not in tableInsertModeElements)):
  279. parent.insertText(data)
  280. else:
  281. # We should be in the InTable mode. This means we want to do
  282. # special magic element rearranging
  283. parent, insertBefore = self.getTableMisnestedNodePosition()
  284. parent.insertText(data, insertBefore)
  285. def getTableMisnestedNodePosition(self):
  286. """Get the foster parent element, and sibling to insert before
  287. (or None) when inserting a misnested table node"""
  288. # The foster parent element is the one which comes before the most
  289. # recently opened table element
  290. # XXX - this is really inelegant
  291. lastTable = None
  292. fosterParent = None
  293. insertBefore = None
  294. for elm in self.openElements[::-1]:
  295. if elm.name == "table":
  296. lastTable = elm
  297. break
  298. if lastTable:
  299. # XXX - we should really check that this parent is actually a
  300. # node here
  301. if lastTable.parent:
  302. fosterParent = lastTable.parent
  303. insertBefore = lastTable
  304. else:
  305. fosterParent = self.openElements[
  306. self.openElements.index(lastTable) - 1]
  307. else:
  308. fosterParent = self.openElements[0]
  309. return fosterParent, insertBefore
  310. def generateImpliedEndTags(self, exclude=None):
  311. name = self.openElements[-1].name
  312. # XXX td, th and tr are not actually needed
  313. if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and
  314. name != exclude):
  315. self.openElements.pop()
  316. # XXX This is not entirely what the specification says. We should
  317. # investigate it more closely.
  318. self.generateImpliedEndTags(exclude)
  319. def getDocument(self):
  320. """Return the final tree"""
  321. return self.document
  322. def getFragment(self):
  323. """Return the final fragment"""
  324. # assert self.innerHTML
  325. fragment = self.fragmentClass()
  326. self.openElements[0].reparentChildren(fragment)
  327. return fragment
  328. def testSerializer(self, node):
  329. """Serialize the subtree of node in the format required by unit tests
  330. :arg node: the node from which to start serializing
  331. """
  332. raise NotImplementedError