1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
| // This regex will parse the piece of html code to extract the elements
// we need. Note that it is not fully armored against strange or invalid html!
// This is a VERY complex recursive regex (took me hours to make it work!).
// So let's detail it!
// Note about opening elements: when no tag name is given, all opening
// elements are tested. There seems to be a bug with self-closing elements
// like <hr /> (returns a white page, without any exception).
// So I added a negative look-ahead test to exclude these things
// explicitly (even though their matching test should fail nicely!).
// WARNING: This seems to be a VERY sensitive regex, prone to strange errors
// very quickly.
// Build a PCRE pattern that matches one complete element: its opening tag,
// everything inside it (including nested elements with the same tag name)
// and the matching closing tag.
// $tagname and $attributes are spliced in verbatim; they are defined outside
// this snippet and are presumably regex fragments -- TODO confirm. Because
// the delimiter is "/", any literal slash they contain must already be
// escaped.
// Named groups: "tagname" (name of the outer opening element), "content"
// (everything between the outer opening and closing tags) and "subpattern"
// (one nested same-name element, also used as the recursion target).
$regex = '/'
// The initial open element: "<", the tag name (captured as "tagname"), a
// negative look-ahead rejecting a name directly followed by optional spaces
// and "/>" (self-closing form), then the attributes pattern, optional
// whitespace and ">".
. '<(?P<tagname>'.$tagname.')(?! *\/>)'.$attributes.'\s*>'
// Start of the "content" group.
// Condition: if what follows is NOT the closing tag of the captured name...
. '(?P<content>(?(?!<\/(?P=tagname)>)'
// ...then nested condition: if what follows is an opening tag with the same
// name (bare, or followed by a space and attributes), try the (recursive)
// sub-pattern matching.
. '(?(?=<(?P=tagname)(?: [^>]*|)>)'
// The start of the recursive sub-pattern.
. '(?P<subpattern>'
// A nested element of the same kind as the first opening one. (?P=tagname)
// is a back-reference, so the nested tag must repeat the captured name.
. '<(?P=tagname)(?: [^>]*|)>'
// Condition: if what follows is not a closing tag...
. '(?(?!<\/(?P=tagname)>)'
// ...then condition: if it is an opening one, recurse into "subpattern"
// through the (?P>subpattern) call.
. '(?(?=<(?P=tagname)(?: [^>]*|)>)(?P>subpattern)'
// Else, consume as many chars as possible until the next opening or closing
// tag: any run of chars that are not directly followed by such a tag, plus
// the one char sitting right before the tag (so at least one char is taken).
. '|((?:.(?!<(?P=tagname)(?: [^>]*|)>)(?!<\/(?P=tagname)>))*.))'
// Close the "not a closing tag" condition and repeat it (retest) as long as
// it keeps matching.
. ')*'
// The matching closing tag of the nested element. The trailing comment on
// the next line is a leftover, commented-out pattern fragment.
. '<\/(?P=tagname)>'//.*?'
// End of subpattern.
. ')'
// Else branch of the "is an opening tag" condition: consume as many chars as
// possible until the next opening or closing tag (same construct as inside
// the sub-pattern).
. '|((?:.(?!<(?P=tagname)(?: [^>]*|)>)(?!<\/(?P=tagname)>))*.))'
// Close the outer condition, repeat (retest) it, then close the "content"
// group.
. ')*)'
// Out-most closing tag.
. '<\/(?P=tagname)>'
// Closing delimiter plus the "s" (dot-all) modifier, so "." also matches
// newlines and the content may span several lines. No "i" modifier is set
// here, so the back-references compare the tag name case-sensitively.
// NOTE(review): the "|" after the semicolon below looks like table-markup
// residue from extraction rather than PHP -- confirm and remove.
. '/s'; |