From c3f0797385b86a67543631e5ab9d78d76c08b421 Mon Sep 17 00:00:00 2001
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
Date: Sat, 3 Jun 2023 00:13:14 +0200
Subject: [PATCH] Implement iteration cache, item cache and length cache for
node list iteration (#11330)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* Implement iteration cache, item cache and length cache for node list iteration
The current implementation follows the spec requirement that the list
must be "live". This means that changes in the document must be
reflected in the existing node lists without requiring the user to
refetch the node list.
The consequence is that getting any item, or the length of the list,
always starts searching from the root element of the node list. This
results in O(n) time to get any item or the length. If there's a for
loop over the node list, this means the iterations will take O(n²) time
in total. This causes real-world performance issues with potential for
downtime (see GH-11308 and its references for details).
We fix this by introducing a caching strategy. We cache the last
iterated object in the iterator, the last requested item in the node
list, and the last length computation. To invalidate the cache, we
simply count the number of modifications made to the containing
document. If the modification number does not match what the number was
during caching, we know the document has been modified and the cache is
invalid. If this ever overflows, we saturate the modification number and
don't do any caching anymore. Note that we don't check for overflow on
64-bit systems because it would take hundreds of years to overflow.
Fixes GH-11308.
---
NEWS | 3 +
UPGRADING.INTERNALS | 18 +++
ext/dom/document.c | 35 ++++--
ext/dom/documenttype.c | 4 +-
ext/dom/dom_iterators.c | 43 ++++---
ext/dom/element.c | 9 +-
ext/dom/node.c | 18 ++-
ext/dom/nodelist.c | 110 +++++++++++++++---
ext/dom/parentnode.c | 10 ++
ext/dom/php_dom.c | 82 ++++++++++---
ext/dom/php_dom.h | 38 +++++-
ext/dom/processinginstruction.c | 2 +
...ocument_getElementsByTagName_liveness.phpt | 47 ++++++++
...tElementsByTagName_liveness_simplexml.phpt | 29 +++++
...tElementsByTagName_liveness_tree_walk.phpt | 89 ++++++++++++++
...tsByTagName_liveness_write_properties.phpt | 43 +++++++
...etElementsByTagName_liveness_xinclude.phpt | 43 +++++++
.../DOMDocument_item_cache_invalidation.phpt | 69 +++++++++++
...DOMDocument_length_cache_invalidation.phpt | 34 ++++++
...ocument_liveness_caching_invalidation.phpt | 43 +++++++
...getElementsByTagName_without_document.phpt | 16 +++
ext/libxml/libxml.c | 8 +-
ext/libxml/php_libxml.h | 31 +++++
ext/simplexml/simplexml.c | 6 +
24 files changed, 764 insertions(+), 66 deletions(-)
create mode 100644 ext/dom/tests/DOMDocument_getElementsByTagName_liveness.phpt
create mode 100644 ext/dom/tests/DOMDocument_getElementsByTagName_liveness_simplexml.phpt
create mode 100644 ext/dom/tests/DOMDocument_getElementsByTagName_liveness_tree_walk.phpt
create mode 100644 ext/dom/tests/DOMDocument_getElementsByTagName_liveness_write_properties.phpt
create mode 100644 ext/dom/tests/DOMDocument_getElementsByTagName_liveness_xinclude.phpt
create mode 100644 ext/dom/tests/DOMDocument_item_cache_invalidation.phpt
create mode 100644 ext/dom/tests/DOMDocument_length_cache_invalidation.phpt
create mode 100644 ext/dom/tests/DOMDocument_liveness_caching_invalidation.phpt
create mode 100644 ext/dom/tests/DOMElement_getElementsByTagName_without_document.phpt
diff --git a/NEWS b/NEWS
index 53e4fe73519..808f3ed8061 100644
--- a/NEWS
+++ b/NEWS
@@ -44,6 +44,9 @@ PHP NEWS
- Date:
. Implement More Appropriate Date/Time Exceptions RFC. (Derick)
+- DOM:
+ . Fix bug GH-11308 (getElementsByTagName() is O(N^2)). (nielsdos)
+
- Exif:
. Removed unneeded codepaths in exif_process_TIFF_in_JPEG(). (nielsdos)
diff --git a/UPGRADING.INTERNALS b/UPGRADING.INTERNALS
index 99b609a115b..6db2d99ec59 100644
--- a/UPGRADING.INTERNALS
+++ b/UPGRADING.INTERNALS
@@ -120,6 +120,24 @@ PHP 8.3 INTERNALS UPGRADE NOTES
- A new function dom_get_doc_props_read_only() is added to gather the document
properties in a read-only way. This function avoids allocation when there are
no document properties changed yet.
+ - The node list returned by DOMNode::getElementsByTagName() and
+ DOMNode::getElementsByTagNameNS() now caches the length and the last requested item.
+ This means that the length and the last requested item are not recalculated
+ when the node list is iterated over multiple times.
+ If you do not use the internal PHP dom APIs to modify the document, you need to
+ manually invalidate the cache using php_libxml_invalidate_node_list_cache_from_doc().
+ Furthermore, the following internal APIs were added to handle the cache:
+ . php_dom_is_cache_tag_stale_from_doc_ptr()
+ . php_dom_is_cache_tag_stale_from_node()
+ . php_dom_mark_cache_tag_up_to_date_from_node()
+ - The function dom_get_elements_by_tag_name_ns_raw() has an additional parameter to indicate
+ the base node of the node list. This function also no longer accepts -1 as the index argument.
+ - The function dom_namednode_iter() has additional arguments to avoid recomputing the length of
+ the strings.
+
+ g. ext/libxml
+ - Two new functions: php_libxml_invalidate_node_list_cache_from_doc() and
+ php_libxml_invalidate_node_list_cache() were added to invalidate the cache of a node list.
========================
4. OpCode changes
diff --git a/ext/dom/document.c b/ext/dom/document.c
index 13324645e98..06c4b2b97b9 100644
--- a/ext/dom/document.c
+++ b/ext/dom/document.c
@@ -777,7 +777,6 @@ PHP_METHOD(DOMDocument, getElementsByTagName)
size_t name_len;
dom_object *intern, *namednode;
char *name;
- xmlChar *local;
id = ZEND_THIS;
if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &name, &name_len) == FAILURE) {
@@ -788,8 +787,7 @@ PHP_METHOD(DOMDocument, getElementsByTagName)
php_dom_create_iterator(return_value, DOM_NODELIST);
namednode = Z_DOMOBJ_P(return_value);
- local = xmlCharStrndup(name, name_len);
- dom_namednode_iter(intern, 0, namednode, NULL, local, NULL);
+ dom_namednode_iter(intern, 0, namednode, NULL, name, name_len, NULL, 0);
}
/* }}} end dom_document_get_elements_by_tag_name */
@@ -847,6 +845,8 @@ PHP_METHOD(DOMDocument, importNode)
}
}
+ php_libxml_invalidate_node_list_cache_from_doc(docp);
+
DOM_RET_OBJ((xmlNodePtr) retnodep, &ret, intern);
}
/* }}} end dom_document_import_node */
@@ -991,7 +991,6 @@ PHP_METHOD(DOMDocument, getElementsByTagNameNS)
size_t uri_len, name_len;
dom_object *intern, *namednode;
char *uri, *name;
- xmlChar *local, *nsuri;
id = ZEND_THIS;
if (zend_parse_parameters(ZEND_NUM_ARGS(), "s!s", &uri, &uri_len, &name, &name_len) == FAILURE) {
@@ -1002,9 +1001,7 @@ PHP_METHOD(DOMDocument, getElementsByTagNameNS)
php_dom_create_iterator(return_value, DOM_NODELIST);
namednode = Z_DOMOBJ_P(return_value);
- local = xmlCharStrndup(name, name_len);
- nsuri = xmlCharStrndup(uri ? uri : "", uri_len);
- dom_namednode_iter(intern, 0, namednode, NULL, local, nsuri);
+ dom_namednode_iter(intern, 0, namednode, NULL, name, name_len, uri ? uri : "", uri_len);
}
/* }}} end dom_document_get_elements_by_tag_name_ns */
@@ -1070,6 +1067,8 @@ PHP_METHOD(DOMDocument, normalizeDocument)
DOM_GET_OBJ(docp, id, xmlDocPtr, intern);
+ php_libxml_invalidate_node_list_cache_from_doc(docp);
+
dom_normalize((xmlNodePtr) docp);
}
/* }}} end dom_document_normalize_document */
@@ -1328,10 +1327,14 @@ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) {
if (id != NULL) {
intern = Z_DOMOBJ_P(id);
+ size_t old_modification_nr = 0;
if (intern != NULL) {
docp = (xmlDocPtr) dom_object_get_node(intern);
doc_prop = NULL;
if (docp != NULL) {
+ const php_libxml_doc_ptr *doc_ptr = docp->_private;
+ ZEND_ASSERT(doc_ptr != NULL); /* Must exist, we have a document */
+ old_modification_nr = doc_ptr->cache_tag.modification_nr;
php_libxml_decrement_node_ptr((php_libxml_node_object *) intern);
doc_prop = intern->document->doc_props;
intern->document->doc_props = NULL;
@@ -1348,6 +1351,12 @@ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) {
}
php_libxml_increment_node_ptr((php_libxml_node_object *)intern, (xmlNodePtr)newdoc, (void *)intern);
+ /* Since iterators should invalidate, we need to start the modification number from the old counter */
+ if (old_modification_nr != 0) {
+ php_libxml_doc_ptr* doc_ptr = (php_libxml_doc_ptr*) ((php_libxml_node_object*) intern)->node; /* downcast */
+ doc_ptr->cache_tag.modification_nr = old_modification_nr;
+ php_libxml_invalidate_node_list_cache(doc_ptr);
+ }
RETURN_TRUE;
} else {
@@ -1563,6 +1572,8 @@ PHP_METHOD(DOMDocument, xinclude)
php_dom_remove_xinclude_nodes(root);
}
+ php_libxml_invalidate_node_list_cache_from_doc(docp);
+
if (err) {
RETVAL_LONG(err);
} else {
@@ -1871,10 +1882,14 @@ static void dom_load_html(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{ */
if (id != NULL && instanceof_function(Z_OBJCE_P(id), dom_document_class_entry)) {
intern = Z_DOMOBJ_P(id);
+ size_t old_modification_nr = 0;
if (intern != NULL) {
docp = (xmlDocPtr) dom_object_get_node(intern);
doc_prop = NULL;
if (docp != NULL) {
+ const php_libxml_doc_ptr *doc_ptr = docp->_private;
+ ZEND_ASSERT(doc_ptr != NULL); /* Must exist, we have a document */
+ old_modification_nr = doc_ptr->cache_tag.modification_nr;
php_libxml_decrement_node_ptr((php_libxml_node_object *) intern);
doc_prop = intern->document->doc_props;
intern->document->doc_props = NULL;
@@ -1891,6 +1906,12 @@ static void dom_load_html(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{ */
}
php_libxml_increment_node_ptr((php_libxml_node_object *)intern, (xmlNodePtr)newdoc, (void *)intern);
+ /* Since iterators should invalidate, we need to start the modification number from the old counter */
+ if (old_modification_nr != 0) {
+ php_libxml_doc_ptr* doc_ptr = (php_libxml_doc_ptr*) ((php_libxml_node_object*) intern)->node; /* downcast */
+ doc_ptr->cache_tag.modification_nr = old_modification_nr;
+ php_libxml_invalidate_node_list_cache(doc_ptr);
+ }
RETURN_TRUE;
} else {
diff --git a/ext/dom/documenttype.c b/ext/dom/documenttype.c
index b046b05f80e..cfc4b043edb 100644
--- a/ext/dom/documenttype.c
+++ b/ext/dom/documenttype.c
@@ -65,7 +65,7 @@ int dom_documenttype_entities_read(dom_object *obj, zval *retval)
entityht = (xmlHashTable *) doctypep->entities;
intern = Z_DOMOBJ_P(retval);
- dom_namednode_iter(obj, XML_ENTITY_NODE, intern, entityht, NULL, NULL);
+ dom_namednode_iter(obj, XML_ENTITY_NODE, intern, entityht, NULL, 0, NULL, 0);
return SUCCESS;
}
@@ -93,7 +93,7 @@ int dom_documenttype_notations_read(dom_object *obj, zval *retval)
notationht = (xmlHashTable *) doctypep->notations;
intern = Z_DOMOBJ_P(retval);
- dom_namednode_iter(obj, XML_NOTATION_NODE, intern, notationht, NULL, NULL);
+ dom_namednode_iter(obj, XML_NOTATION_NODE, intern, notationht, NULL, 0, NULL, 0);
return SUCCESS;
}
diff --git a/ext/dom/dom_iterators.c b/ext/dom/dom_iterators.c
index 72c97104db0..2cf2c7bb6e7 100644
--- a/ext/dom/dom_iterators.c
+++ b/ext/dom/dom_iterators.c
@@ -179,7 +179,7 @@ static void php_dom_iterator_move_forward(zend_object_iterator *iter) /* {{{ */
dom_object *intern;
dom_object *nnmap;
dom_nnodemap_object *objmap;
- int previndex=0;
+ int previndex;
HashTable *nodeht;
zval *entry;
bool do_curobj_undef = 1;
@@ -205,23 +205,32 @@ static void php_dom_iterator_move_forward(zend_object_iterator *iter) /* {{{ */
do_curobj_undef = 0;
}
} else {
- curnode = (xmlNodePtr)((php_libxml_node_ptr *)intern->ptr)->node;
if (objmap->nodetype == XML_ATTRIBUTE_NODE ||
objmap->nodetype == XML_ELEMENT_NODE) {
+ curnode = (xmlNodePtr)((php_libxml_node_ptr *)intern->ptr)->node;
curnode = curnode->next;
} else {
- /* Nav the tree evey time as this is LIVE */
+ /* The collection is live, we nav the tree from the base object if we cannot
+ * use the cache to restart from the last point. */
basenode = dom_object_get_node(objmap->baseobj);
- if (basenode && (basenode->type == XML_DOCUMENT_NODE ||
- basenode->type == XML_HTML_DOCUMENT_NODE)) {
- basenode = xmlDocGetRootElement((xmlDoc *) basenode);
- } else if (basenode) {
- basenode = basenode->children;
- } else {
+ if (UNEXPECTED(!basenode)) {
goto err;
}
+ if (php_dom_is_cache_tag_stale_from_node(&iterator->cache_tag, basenode)) {
+ php_dom_mark_cache_tag_up_to_date_from_node(&iterator->cache_tag, basenode);
+ previndex = 0;
+ if (basenode && (basenode->type == XML_DOCUMENT_NODE ||
+ basenode->type == XML_HTML_DOCUMENT_NODE)) {
+ curnode = xmlDocGetRootElement((xmlDoc *) basenode);
+ } else {
+ curnode = basenode->children;
+ }
+ } else {
+ previndex = iter->index - 1;
+ curnode = (xmlNodePtr)((php_libxml_node_ptr *)intern->ptr)->node;
+ }
curnode = dom_get_elements_by_tag_name_ns_raw(
- basenode, (char *) objmap->ns, (char *) objmap->local, &previndex, iter->index);
+ basenode, curnode, (char *) objmap->ns, (char *) objmap->local, &previndex, iter->index);
}
}
} else {
@@ -258,7 +267,7 @@ zend_object_iterator *php_dom_get_iterator(zend_class_entry *ce, zval *object, i
{
dom_object *intern;
dom_nnodemap_object *objmap;
- xmlNodePtr nodep, curnode=NULL;
+ xmlNodePtr curnode=NULL;
int curindex = 0;
HashTable *nodeht;
zval *entry;
@@ -270,6 +279,7 @@ zend_object_iterator *php_dom_get_iterator(zend_class_entry *ce, zval *object, i
}
iterator = emalloc(sizeof(php_dom_iterator));
zend_iterator_init(&iterator->intern);
+ iterator->cache_tag.modification_nr = 0;
ZVAL_OBJ_COPY(&iterator->intern.data, Z_OBJ_P(object));
iterator->intern.funcs = &php_dom_iterator_funcs;
@@ -288,24 +298,25 @@ zend_object_iterator *php_dom_get_iterator(zend_class_entry *ce, zval *object, i
ZVAL_COPY(&iterator->curobj, entry);
}
} else {
- nodep = (xmlNode *)dom_object_get_node(objmap->baseobj);
- if (!nodep) {
+ xmlNodePtr basep = (xmlNode *)dom_object_get_node(objmap->baseobj);
+ if (!basep) {
goto err;
}
if (objmap->nodetype == XML_ATTRIBUTE_NODE || objmap->nodetype == XML_ELEMENT_NODE) {
if (objmap->nodetype == XML_ATTRIBUTE_NODE) {
- curnode = (xmlNodePtr) nodep->properties;
+ curnode = (xmlNodePtr) basep->properties;
} else {
- curnode = (xmlNodePtr) nodep->children;
+ curnode = (xmlNodePtr) basep->children;
}
} else {
+ xmlNodePtr nodep = basep;
if (nodep->type == XML_DOCUMENT_NODE || nodep->type == XML_HTML_DOCUMENT_NODE) {
nodep = xmlDocGetRootElement((xmlDoc *) nodep);
} else {
nodep = nodep->children;
}
curnode = dom_get_elements_by_tag_name_ns_raw(
- nodep, (char *) objmap->ns, (char *) objmap->local, &curindex, 0);
+ basep, nodep, (char *) objmap->ns, (char *) objmap->local, &curindex, 0);
}
}
} else {
diff --git a/ext/dom/element.c b/ext/dom/element.c
index 19cef583465..93d9ad5fb91 100644
--- a/ext/dom/element.c
+++ b/ext/dom/element.c
@@ -511,7 +511,6 @@ PHP_METHOD(DOMElement, getElementsByTagName)
size_t name_len;
dom_object *intern, *namednode;
char *name;
- xmlChar *local;
id = ZEND_THIS;
if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &name, &name_len) == FAILURE) {
@@ -522,8 +521,7 @@ PHP_METHOD(DOMElement, getElementsByTagName)
php_dom_create_iterator(return_value, DOM_NODELIST);
namednode = Z_DOMOBJ_P(return_value);
- local = xmlCharStrndup(name, name_len);
- dom_namednode_iter(intern, 0, namednode, NULL, local, NULL);
+ dom_namednode_iter(intern, 0, namednode, NULL, name, name_len, NULL, 0);
}
/* }}} end dom_element_get_elements_by_tag_name */
@@ -930,7 +928,6 @@ PHP_METHOD(DOMElement, getElementsByTagNameNS)
size_t uri_len, name_len;
dom_object *intern, *namednode;
char *uri, *name;
- xmlChar *local, *nsuri;
id = ZEND_THIS;
if (zend_parse_parameters(ZEND_NUM_ARGS(), "s!s", &uri, &uri_len, &name, &name_len) == FAILURE) {
@@ -941,9 +938,7 @@ PHP_METHOD(DOMElement, getElementsByTagNameNS)
php_dom_create_iterator(return_value, DOM_NODELIST);
namednode = Z_DOMOBJ_P(return_value);
- local = xmlCharStrndup(name, name_len);
- nsuri = xmlCharStrndup(uri ? uri : "", uri_len);
- dom_namednode_iter(intern, 0, namednode, NULL, local, nsuri);
+ dom_namednode_iter(intern, 0, namednode, NULL, name, name_len, uri ? uri : "", uri_len);
}
/* }}} end dom_element_get_elements_by_tag_name_ns */
diff --git a/ext/dom/node.c b/ext/dom/node.c
index fdb51bf5109..78c9b2dca18 100644
--- a/ext/dom/node.c
+++ b/ext/dom/node.c
@@ -195,6 +195,8 @@ int dom_node_node_value_write(dom_object *obj, zval *newval)
break;
}
+ php_libxml_invalidate_node_list_cache_from_doc(nodep->doc);
+
zend_string_release_ex(str, 0);
return SUCCESS;
}
@@ -274,7 +276,7 @@ int dom_node_child_nodes_read(dom_object *obj, zval *retval)
php_dom_create_iterator(retval, DOM_NODELIST);
intern = Z_DOMOBJ_P(retval);
- dom_namednode_iter(obj, XML_ELEMENT_NODE, intern, NULL, NULL, NULL);
+ dom_namednode_iter(obj, XML_ELEMENT_NODE, intern, NULL, NULL, 0, NULL, 0);
return SUCCESS;
}
@@ -482,7 +484,7 @@ int dom_node_attributes_read(dom_object *obj, zval *retval)
if (nodep->type == XML_ELEMENT_NODE) {
php_dom_create_iterator(retval, DOM_NAMEDNODEMAP);
intern = Z_DOMOBJ_P(retval);
- dom_namednode_iter(obj, XML_ATTRIBUTE_NODE, intern, NULL, NULL, NULL);
+ dom_namednode_iter(obj, XML_ATTRIBUTE_NODE, intern, NULL, NULL, 0, NULL, 0);
} else {
ZVAL_NULL(retval);
}
@@ -769,6 +771,8 @@ int dom_node_text_content_write(dom_object *obj, zval *newval)
return FAILURE;
}
+ php_libxml_invalidate_node_list_cache_from_doc(nodep->doc);
+
const xmlChar *xmlChars = (const xmlChar *) ZSTR_VAL(str);
int type = nodep->type;
@@ -897,6 +901,8 @@ PHP_METHOD(DOMNode, insertBefore)
php_libxml_increment_doc_ref((php_libxml_node_object *)childobj, NULL);
}
+ php_libxml_invalidate_node_list_cache_from_doc(parentp->doc);
+
if (ref != NULL) {
DOM_GET_OBJ(refp, ref, xmlNodePtr, refpobj);
if (refp->parent != parentp) {
@@ -1086,6 +1092,7 @@ PHP_METHOD(DOMNode, replaceChild)
nodep->doc->intSubset = (xmlDtd *) newchild;
}
}
+ php_libxml_invalidate_node_list_cache_from_doc(nodep->doc);
DOM_RET_OBJ(oldchild, &ret, intern);
}
/* }}} end dom_node_replace_child */
@@ -1127,6 +1134,7 @@ PHP_METHOD(DOMNode, removeChild)
}
xmlUnlinkNode(child);
+ php_libxml_invalidate_node_list_cache_from_doc(nodep->doc);
DOM_RET_OBJ(child, &ret, intern);
}
/* }}} end dom_node_remove_child */
@@ -1230,6 +1238,8 @@ PHP_METHOD(DOMNode, appendChild)
dom_reconcile_ns(nodep->doc, new_child);
+ php_libxml_invalidate_node_list_cache_from_doc(nodep->doc);
+
DOM_RET_OBJ(new_child, &ret, intern);
}
/* }}} end dom_node_append_child */
@@ -1339,6 +1349,8 @@ PHP_METHOD(DOMNode, normalize)
DOM_GET_OBJ(nodep, id, xmlNodePtr, intern);
+ php_libxml_invalidate_node_list_cache_from_doc(nodep->doc);
+
dom_normalize(nodep);
}
@@ -1571,6 +1583,8 @@ static void dom_canonicalization(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{
RETURN_THROWS();
}
+ php_libxml_invalidate_node_list_cache_from_doc(docp);
+
if (xpath_array == NULL) {
if (nodep->type != XML_DOCUMENT_NODE) {
ctxp = xmlXPathNewContext(docp);
diff --git a/ext/dom/nodelist.c b/ext/dom/nodelist.c
index b03ebe1acd9..20e3b18bee8 100644
--- a/ext/dom/nodelist.c
+++ b/ext/dom/nodelist.c
@@ -31,6 +31,24 @@
* Since:
*/
+static zend_always_inline void objmap_cache_release_cached_obj(dom_nnodemap_object *objmap)
+{
+ if (objmap->cached_obj) {
+ /* Since the DOM is a tree there can be no cycles. */
+ if (GC_DELREF(&objmap->cached_obj->std) == 0) {
+ zend_objects_store_del(&objmap->cached_obj->std);
+ }
+ objmap->cached_obj = NULL;
+ objmap->cached_obj_index = 0;
+ }
+}
+
+static zend_always_inline void reset_objmap_cache(dom_nnodemap_object *objmap)
+{
+ objmap_cache_release_cached_obj(objmap);
+ objmap->cached_length = -1;
+}
+
static int get_nodelist_length(dom_object *obj)
{
dom_nnodemap_object *objmap = (dom_nnodemap_object *) obj->ptr;
@@ -52,6 +70,17 @@ static int get_nodelist_length(dom_object *obj)
return 0;
}
+ if (!php_dom_is_cache_tag_stale_from_node(&objmap->cache_tag, nodep)) {
+ if (objmap->cached_length >= 0) {
+ return objmap->cached_length;
+ }
+ /* Only the length is out-of-date, the cache tag is still valid.
+ * Therefore, only overwrite the length and keep the currently cached object. */
+ } else {
+ php_dom_mark_cache_tag_up_to_date_from_node(&objmap->cache_tag, nodep);
+ reset_objmap_cache(objmap);
+ }
+
int count = 0;
if (objmap->nodetype == XML_ATTRIBUTE_NODE || objmap->nodetype == XML_ELEMENT_NODE) {
xmlNodePtr curnode = nodep->children;
@@ -63,15 +92,18 @@ static int get_nodelist_length(dom_object *obj)
}
}
} else {
+ xmlNodePtr basep = nodep;
if (nodep->type == XML_DOCUMENT_NODE || nodep->type == XML_HTML_DOCUMENT_NODE) {
nodep = xmlDocGetRootElement((xmlDoc *) nodep);
} else {
nodep = nodep->children;
}
dom_get_elements_by_tag_name_ns_raw(
- nodep, (char *) objmap->ns, (char *) objmap->local, &count, -1);
+ basep, nodep, (char *) objmap->ns, (char *) objmap->local, &count, INT_MAX - 1 /* because of <= */);
}
+ objmap->cached_length = count;
+
return count;
}
@@ -113,11 +145,12 @@ PHP_METHOD(DOMNodeList, item)
zval *id;
zend_long index;
int ret;
+ bool cache_itemnode = false;
dom_object *intern;
xmlNodePtr itemnode = NULL;
dom_nnodemap_object *objmap;
- xmlNodePtr nodep, curnode;
+ xmlNodePtr basep;
int count = 0;
id = ZEND_THIS;
@@ -145,23 +178,51 @@ PHP_METHOD(DOMNodeList, item)
return;
}
} else if (objmap->baseobj) {
- nodep = dom_object_get_node(objmap->baseobj);
- if (nodep) {
- if (objmap->nodetype == XML_ATTRIBUTE_NODE || objmap->nodetype == XML_ELEMENT_NODE) {
- curnode = nodep->children;
- while (count < index && curnode != NULL) {
- count++;
- curnode = curnode->next;
- }
- itemnode = curnode;
- } else {
- if (nodep->type == XML_DOCUMENT_NODE || nodep->type == XML_HTML_DOCUMENT_NODE) {
- nodep = xmlDocGetRootElement((xmlDoc *) nodep);
+ basep = dom_object_get_node(objmap->baseobj);
+ if (basep) {
+ xmlNodePtr nodep = basep;
+ /* For now we're only able to use cache for forward search.
+ * TODO: in the future we could extend the logic of the node list such that backwards searches
+ * are also possible. */
+ bool restart = true;
+ int relative_index = index;
+ if (index >= objmap->cached_obj_index && objmap->cached_obj && !php_dom_is_cache_tag_stale_from_node(&objmap->cache_tag, nodep)) {
+ xmlNodePtr cached_obj_xml_node = dom_object_get_node(objmap->cached_obj);
+
+ /* The node cannot be NULL if the cache is valid. If it is NULL, then it means we
+ * forgot an invalidation somewhere. Take the defensive programming approach and invalidate
+ * it here if it's NULL (except in debug mode where we would want to catch this). */
+ if (UNEXPECTED(cached_obj_xml_node == NULL)) {
+#if ZEND_DEBUG
+ ZEND_UNREACHABLE();
+#endif
+ reset_objmap_cache(objmap);
} else {
+ restart = false;
+ relative_index -= objmap->cached_obj_index;
+ nodep = cached_obj_xml_node;
+ }
+ }
+ if (objmap->nodetype == XML_ATTRIBUTE_NODE || objmap->nodetype == XML_ELEMENT_NODE) {
+ if (restart) {
nodep = nodep->children;
}
- itemnode = dom_get_elements_by_tag_name_ns_raw(nodep, (char *) objmap->ns, (char *) objmap->local, &count, index);
+ while (count < relative_index && nodep != NULL) {
+ count++;
+ nodep = nodep->next;
+ }
+ itemnode = nodep;
+ } else {
+ if (restart) {
+ if (basep->type == XML_DOCUMENT_NODE || basep->type == XML_HTML_DOCUMENT_NODE) {
+ nodep = xmlDocGetRootElement((xmlDoc*) basep);
+ } else {
+ nodep = basep->children;
+ }
+ }
+ itemnode = dom_get_elements_by_tag_name_ns_raw(basep, nodep, (char *) objmap->ns, (char *) objmap->local, &count, relative_index);
}
+ cache_itemnode = true;
}
}
}
@@ -169,6 +230,25 @@ PHP_METHOD(DOMNodeList, item)
if (itemnode) {
DOM_RET_OBJ(itemnode, &ret, objmap->baseobj);
+ if (cache_itemnode) {
+ /* Hold additional reference for the cache, must happen before releasing the cache
+ * because we might be the last reference holder.
+ * Instead of storing and copying zvals, we store the object pointer directly.
+ * This saves us some bytes because a pointer is smaller than a zval.
+ * This also means we have to manually refcount the objects here, and remove the reference count
+ * in reset_objmap_cache() and the destructor. */
+ dom_object *cached_obj = Z_DOMOBJ_P(return_value);
+ GC_ADDREF(&cached_obj->std);
+ /* If the tag is stale, all cached data is useless. Otherwise only the cached object is useless. */
+ if (php_dom_is_cache_tag_stale_from_node(&objmap->cache_tag, itemnode)) {
+ php_dom_mark_cache_tag_up_to_date_from_node(&objmap->cache_tag, itemnode);
+ reset_objmap_cache(objmap);
+ } else {
+ objmap_cache_release_cached_obj(objmap);
+ }
+ objmap->cached_obj_index = index;
+ objmap->cached_obj = cached_obj;
+ }
return;
}
}
diff --git a/ext/dom/parentnode.c b/ext/dom/parentnode.c
index 4d0fffeb9e0..36cd6104f38 100644
--- a/ext/dom/parentnode.c
+++ b/ext/dom/parentnode.c
@@ -280,6 +280,8 @@ void dom_parent_node_append(dom_object *context, zval *nodes, int nodesc)
return;
}
+ php_libxml_invalidate_node_list_cache_from_doc(parentNode->doc);
+
xmlNode *fragment = dom_zvals_to_fragment(context->document, parentNode, nodes, nodesc);
if (fragment == NULL) {
@@ -322,6 +324,8 @@ void dom_parent_node_prepend(dom_object *context, zval *nodes, int nodesc)
return;
}
+ php_libxml_invalidate_node_list_cache_from_doc(parentNode->doc);
+
xmlNodePtr newchild, nextsib;
xmlNode *fragment = dom_zvals_to_fragment(context->document, parentNode, nodes, nodesc);
@@ -402,6 +406,8 @@ void dom_parent_node_after(dom_object *context, zval *nodes, int nodesc)
doc = prevsib->doc;
+ php_libxml_invalidate_node_list_cache_from_doc(doc);
+
/* Spec step 4: convert nodes into fragment */
fragment = dom_zvals_to_fragment(context->document, parentNode, nodes, nodesc);
@@ -451,6 +457,8 @@ void dom_parent_node_before(dom_object *context, zval *nodes, int nodesc)
doc = nextsib->doc;
+ php_libxml_invalidate_node_list_cache_from_doc(doc);
+
/* Spec step 4: convert nodes into fragment */
fragment = dom_zvals_to_fragment(context->document, parentNode, nodes, nodesc);
@@ -506,6 +514,8 @@ void dom_child_node_remove(dom_object *context)
return;
}
+ php_libxml_invalidate_node_list_cache_from_doc(context->document->ptr);
+
while (children) {
if (children == child) {
xmlUnlinkNode(child);
diff --git a/ext/dom/php_dom.c b/ext/dom/php_dom.c
index e02b0973291..44c10ea6a8a 100644
--- a/ext/dom/php_dom.c
+++ b/ext/dom/php_dom.c
@@ -942,7 +942,7 @@ void dom_objects_free_storage(zend_object *object)
}
/* }}} */
-void dom_namednode_iter(dom_object *basenode, int ntype, dom_object *intern, xmlHashTablePtr ht, xmlChar *local, xmlChar *ns) /* {{{ */
+void dom_namednode_iter(dom_object *basenode, int ntype, dom_object *intern, xmlHashTablePtr ht, const char *local, size_t local_len, const char *ns, size_t ns_len) /* {{{ */
{
dom_nnodemap_object *mapptr = (dom_nnodemap_object *) intern->ptr;
@@ -950,11 +950,33 @@ void dom_namednode_iter(dom_object *basenode, int ntype, dom_object *intern, xml
ZVAL_OBJ_COPY(&mapptr->baseobj_zv, &basenode->std);
+ xmlDocPtr doc = basenode->document ? basenode->document->ptr : NULL;
+
mapptr->baseobj = basenode;
mapptr->nodetype = ntype;
mapptr->ht = ht;
- mapptr->local = local;
- mapptr->ns = ns;
+
+ const xmlChar* tmp;
+
+ if (local) {
+ int len = local_len > INT_MAX ? -1 : (int) local_len;
+ if (doc != NULL && (tmp = xmlDictExists(doc->dict, (const xmlChar *)local, len)) != NULL) {
+ mapptr->local = (xmlChar*) tmp;
+ } else {
+ mapptr->local = xmlCharStrndup(local, len);
+ mapptr->free_local = true;
+ }
+ }
+
+ if (ns) {
+ int len = ns_len > INT_MAX ? -1 : (int) ns_len;
+ if (doc != NULL && (tmp = xmlDictExists(doc->dict, (const xmlChar *)ns, len)) != NULL) {
+ mapptr->ns = (xmlChar*) tmp;
+ } else {
+ mapptr->ns = xmlCharStrndup(ns, len);
+ mapptr->free_ns = true;
+ }
+ }
}
/* }}} */
@@ -1010,10 +1032,13 @@ void dom_nnodemap_objects_free_storage(zend_object *object) /* {{{ */
dom_nnodemap_object *objmap = (dom_nnodemap_object *)intern->ptr;
if (objmap) {
- if (objmap->local) {
+ if (objmap->cached_obj && GC_DELREF(&objmap->cached_obj->std) == 0) {
+ zend_objects_store_del(&objmap->cached_obj->std);
+ }
+ if (objmap->free_local) {
xmlFree(objmap->local);
}
- if (objmap->ns) {
+ if (objmap->free_ns) {
xmlFree(objmap->ns);
}
if (!Z_ISUNDEF(objmap->baseobj_zv)) {
@@ -1042,7 +1067,13 @@ zend_object *dom_nnodemap_objects_new(zend_class_entry *class_type) /* {{{ */
objmap->nodetype = 0;
objmap->ht = NULL;
objmap->local = NULL;
+ objmap->free_local = false;
objmap->ns = NULL;
+ objmap->free_ns = false;
+ objmap->cache_tag.modification_nr = 0;
+ objmap->cached_length = -1;
+ objmap->cached_obj = NULL;
+ objmap->cached_obj_index = 0;
return &intern->std;
}
@@ -1220,19 +1251,25 @@ bool dom_has_feature(zend_string *feature, zend_string *version)
}
/* }}} end dom_has_feature */
-xmlNode *dom_get_elements_by_tag_name_ns_raw(xmlNodePtr nodep, char *ns, char *local, int *cur, int index) /* {{{ */
+xmlNode *dom_get_elements_by_tag_name_ns_raw(xmlNodePtr basep, xmlNodePtr nodep, char *ns, char *local, int *cur, int index) /* {{{ */
{
+ /* Can happen with detached document */
+ if (UNEXPECTED(nodep == NULL)) {
+ return NULL;
+ }
+
xmlNodePtr ret = NULL;
+ bool local_match_any = local[0] == '*' && local[1] == '\0';
/* Note: The spec says that ns == '' must be transformed to ns == NULL. In other words, they are equivalent.
* PHP however does not do this and internally uses the empty string everywhere when the user provides ns == NULL.
* This is because for PHP ns == NULL has another meaning: "match every namespace" instead of "match the empty namespace". */
bool ns_match_any = ns == NULL || (ns[0] == '*' && ns[1] == '\0');
- while (nodep != NULL && (*cur <= index || index == -1)) {
+ while (*cur <= index) {
if (nodep->type == XML_ELEMENT_NODE) {
- if (xmlStrEqual(nodep->name, (xmlChar *)local) || xmlStrEqual((xmlChar *)"*", (xmlChar *)local)) {
- if (ns_match_any || (!strcmp(ns, "") && nodep->ns == NULL) || (nodep->ns != NULL && xmlStrEqual(nodep->ns->href, (xmlChar *)ns))) {
+ if (local_match_any || xmlStrEqual(nodep->name, (xmlChar *)local)) {
+ if (ns_match_any || (ns[0] == '\0' && nodep->ns == NULL) || (nodep->ns != NULL && xmlStrEqual(nodep->ns->href, (xmlChar *)ns))) {
if (*cur == index) {
ret = nodep;
break;
@@ -1240,16 +1277,33 @@ xmlNode *dom_get_elements_by_tag_name_ns_raw(xmlNodePtr nodep, char *ns, char *l
(*cur)++;
}
}
- ret = dom_get_elements_by_tag_name_ns_raw(nodep->children, ns, local, cur, index);
- if (ret != NULL) {
- break;
+
+ if (nodep->children) {
+ nodep = nodep->children;
+ continue;
}
}
- nodep = nodep->next;
+
+ if (nodep->next) {
+ nodep = nodep->next;
+ } else {
+ /* Go upwards, until we find a parent node with a next sibling, or until we hit the base. */
+ do {
+ nodep = nodep->parent;
+ if (nodep == basep) {
+ return NULL;
+ }
+ /* This shouldn't happen, unless there's an invalidation bug somewhere. */
+ if (UNEXPECTED(nodep == NULL)) {
+ zend_throw_error(NULL, "Current node in traversal is not in the document. Please report this as a bug in php-src.");
+ return NULL;
+ }
+ } while (nodep->next == NULL);
+ nodep = nodep->next;
+ }
}
return ret;
}
-/* }}} */
/* }}} end dom_element_get_elements_by_tag_name_ns_raw */
static inline bool is_empty_node(xmlNodePtr nodep)
diff --git a/ext/dom/php_dom.h b/ext/dom/php_dom.h
index a7ae09384cf..0602f4166ea 100644
--- a/ext/dom/php_dom.h
+++ b/ext/dom/php_dom.h
@@ -82,15 +82,22 @@ typedef struct _dom_nnodemap_object {
dom_object *baseobj;
zval baseobj_zv;
int nodetype;
+ int cached_length;
xmlHashTable *ht;
xmlChar *local;
xmlChar *ns;
+ php_libxml_cache_tag cache_tag;
+ dom_object *cached_obj;
+ int cached_obj_index;
+ bool free_local : 1;
+ bool free_ns : 1;
} dom_nnodemap_object;
typedef struct {
zend_object_iterator intern;
zval curobj;
HashPosition pos;
+ php_libxml_cache_tag cache_tag;
} php_dom_iterator;
#include "domexception.h"
@@ -113,14 +120,14 @@ void dom_set_old_ns(xmlDoc *doc, xmlNs *ns);
void dom_reconcile_ns(xmlDocPtr doc, xmlNodePtr nodep);
xmlNsPtr dom_get_nsdecl(xmlNode *node, xmlChar *localName);
void dom_normalize (xmlNodePtr nodep);
-xmlNode *dom_get_elements_by_tag_name_ns_raw(xmlNodePtr nodep, char *ns, char *local, int *cur, int index);
+xmlNode *dom_get_elements_by_tag_name_ns_raw(xmlNodePtr basep, xmlNodePtr nodep, char *ns, char *local, int *cur, int index);
void php_dom_create_implementation(zval *retval);
int dom_hierarchy(xmlNodePtr parent, xmlNodePtr child);
bool dom_has_feature(zend_string *feature, zend_string *version);
int dom_node_is_read_only(xmlNodePtr node);
int dom_node_children_valid(xmlNodePtr node);
void php_dom_create_iterator(zval *return_value, int ce_type);
-void dom_namednode_iter(dom_object *basenode, int ntype, dom_object *intern, xmlHashTablePtr ht, xmlChar *local, xmlChar *ns);
+void dom_namednode_iter(dom_object *basenode, int ntype, dom_object *intern, xmlHashTablePtr ht, const char *local, size_t local_len, const char *ns, size_t ns_len);
xmlNodePtr create_notation(const xmlChar *name, const xmlChar *ExternalID, const xmlChar *SystemID);
xmlNode *php_dom_libxml_hash_iter(xmlHashTable *ht, int index);
xmlNode *php_dom_libxml_notation_iter(xmlHashTable *ht, int index);
@@ -153,6 +160,33 @@ void dom_child_node_remove(dom_object *context);
#define DOM_NODELIST 0
#define DOM_NAMEDNODEMAP 1
+static zend_always_inline bool php_dom_is_cache_tag_stale_from_doc_ptr(const php_libxml_cache_tag *cache_tag, const php_libxml_doc_ptr *doc_ptr)
+{
+ ZEND_ASSERT(cache_tag != NULL);
+ ZEND_ASSERT(doc_ptr != NULL);
+ /* See overflow comment in php_libxml_invalidate_node_list_cache(). */
+#if SIZEOF_SIZE_T == 8
+ return cache_tag->modification_nr != doc_ptr->cache_tag.modification_nr;
+#else
+ return cache_tag->modification_nr != doc_ptr->cache_tag.modification_nr || UNEXPECTED(doc_ptr->cache_tag.modification_nr == SIZE_MAX);
+#endif
+}
+
+static zend_always_inline bool php_dom_is_cache_tag_stale_from_node(const php_libxml_cache_tag *cache_tag, const xmlNodePtr node)
+{
+ ZEND_ASSERT(node != NULL);
+ return !node->doc || !node->doc->_private || php_dom_is_cache_tag_stale_from_doc_ptr(cache_tag, node->doc->_private);
+}
+
+static zend_always_inline void php_dom_mark_cache_tag_up_to_date_from_node(php_libxml_cache_tag *cache_tag, const xmlNodePtr node)
+{
+ ZEND_ASSERT(cache_tag != NULL);
+ if (node->doc && node->doc->_private) {
+ const php_libxml_doc_ptr* doc_ptr = node->doc->_private;
+ cache_tag->modification_nr = doc_ptr->cache_tag.modification_nr;
+ }
+}
+
PHP_MINIT_FUNCTION(dom);
PHP_MSHUTDOWN_FUNCTION(dom);
PHP_MINFO_FUNCTION(dom);
diff --git a/ext/dom/processinginstruction.c b/ext/dom/processinginstruction.c
index 465ecb431e7..c40d24d18ce 100644
--- a/ext/dom/processinginstruction.c
+++ b/ext/dom/processinginstruction.c
@@ -128,6 +128,8 @@ int dom_processinginstruction_data_write(dom_object *obj, zval *newval)
return FAILURE;
}
+ php_libxml_invalidate_node_list_cache_from_doc(nodep->doc);
+
xmlNodeSetContentLen(nodep, (xmlChar *) ZSTR_VAL(str), ZSTR_LEN(str) + 1);
zend_string_release_ex(str, 0);
diff --git a/ext/dom/tests/DOMDocument_getElementsByTagName_liveness.phpt b/ext/dom/tests/DOMDocument_getElementsByTagName_liveness.phpt
new file mode 100644
index 00000000000..2b4622d10d3
--- /dev/null
+++ b/ext/dom/tests/DOMDocument_getElementsByTagName_liveness.phpt
@@ -0,0 +1,47 @@
+--TEST--
+DOMDocument::getElementsByTagName() is live
+--EXTENSIONS--
+dom
+--FILE--
+loadXML( ' Hello xinclude: book.xml not found Hello xinclude: book.xml not found
hello
world
'); + +$elements = $doc->getElementsByTagName('p'); +$elements->item(0); // Activate item cache +$doc->loadHTML('A
B
C
'); +var_dump($elements); +var_dump($elements->item(0)->textContent); // First lookup +var_dump($elements->item(2)->textContent); // Uses cache +var_dump($elements->item(1)->textContent); // Does not use cache + +echo "-- Remove cached item test --\n"; + +$doc = new DOMDocument(); +$doc->loadHTML('hello
world
!
'); + +$elements = $doc->getElementsByTagName('p'); +$item = $elements->item(0); // Activate item cache +var_dump($item->textContent); +$item->remove(); +// Now element 0 means "world", and 1 means "!" +unset($item); +$item = $elements->item(1); +var_dump($item->textContent); + +echo "-- Removal of cached item in loop test --\n"; + +$doc = new DOMDocument; +$doc->loadXML( 'hello
world
!
'); + +$elements = $doc->getElementsByTagName('p'); +$item = $elements->item(0); // Activate item cache +var_dump($elements->length); // Length not cached yet, should still compute +$item->remove(); +// Now element 0 means "world", and 1 means "!" +unset($item); +var_dump($elements->length); +$item = $elements->item(1); +var_dump($item->textContent); +$item = $elements->item(1); +var_dump($item->textContent); +$item = $elements->item(0); +var_dump($item->textContent); +$item = $elements->item(1); +var_dump($item->textContent); + +?> +--EXPECT-- +int(3) +int(2) +string(1) "!" +string(1) "!" +string(5) "world" +string(1) "!" diff --git a/ext/dom/tests/DOMDocument_liveness_caching_invalidation.phpt b/ext/dom/tests/DOMDocument_liveness_caching_invalidation.phpt new file mode 100644 index 00000000000..e05bd1ac6f6 --- /dev/null +++ b/ext/dom/tests/DOMDocument_liveness_caching_invalidation.phpt @@ -0,0 +1,43 @@ +--TEST-- +DOMDocument liveness caching invalidation by textContent +--EXTENSIONS-- +dom +--FILE-- +loadXML('