How can I extract the positional offset of a specific node in an already parsed HTML document? For example, for the document <div>Hello, <b>World!</b></div> I want to be able to tell that the offset of World! is 15:21. Note that the document may be changed while parsing.
I have a solution that renders the whole document with special marker strings, but that is really bad for performance. Any ideas?
package main

import (
	"bytes"
	"log"
	"strings"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

// nodeIndexOffset renders the document twice, once with a start marker and
// once with an end marker injected into the node's text, and locates the
// markers in the rendered output.
func nodeIndexOffset(context *html.Node, node *html.Node) (int, int) {
	if node.Type != html.TextNode {
		node = node.FirstChild
	}
	originalData := node.Data

	var buf bytes.Buffer
	node.Data = "|start|" + originalData
	_ = html.Render(&buf, context.FirstChild)
	start := strings.Index(buf.String(), "|start|")

	buf = bytes.Buffer{}
	node.Data = originalData + "|end|"
	_ = html.Render(&buf, context.FirstChild)
	end := strings.Index(buf.String(), "|end|")

	node.Data = originalData
	return start, end
}

func main() {
	s := "<div>Hello, <b>World!</b></div>"
	context := html.Node{
		Type:     html.ElementNode,
		Data:     "body",
		DataAtom: atom.Body,
	}
	nodes, err := html.ParseFragment(strings.NewReader(s), &context)
	if err != nil {
		log.Fatal(err)
	}
	for _, node := range nodes {
		context.AppendChild(node)
	}
	// nodes[0] is the <div>; its second child is <b>, whose first child
	// is the "World!" text node.
	world := nodes[0].FirstChild.NextSibling.FirstChild
	log.Println("target", world)
	log.Println(nodeIndexOffset(&context, world))
}
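For reference, this prints 15 21 for the example above: the rendered prefix <div>Hello, <b> is 15 bytes long, and World! ends at byte 21. It works, but rendering the whole document twice per lookup is exactly what makes it so expensive.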
I came up with a solution (please correct me if there is another way to do it) where we extend the original HTML package with an additional custom.go file that adds a new exported function. This function can access the unexported data field of the Tokenizer, which holds exactly the start and end positions of the current Node. The positions have to be adjusted after each buffer read; see globalBufDif below.
I don't really like that I have to fork the package just to access a couple of fields, but it seems that this is the Go way.
func parseWithIndexes(p *parser) (map[*Node][2]int, error) {
	// Iterate until EOF. Any other error will cause an early return.
	var err error
	var globalBufDif int
	var prevEndBuf int
	var tokenIndex [2]int
	tokenMap := make(map[*Node][2]int)
	for err != io.EOF {
		// CDATA sections are allowed only in foreign content.
		n := p.oe.top()
		p.tokenizer.AllowCDATA(n != nil && n.Namespace != "")
		// Find the most recently inserted node (the last child of the
		// current insertion point) and record the previous token's span for it.
		t := p.top().FirstChild
		for t != nil && t.NextSibling != nil {
			t = t.NextSibling
		}
		tokenMap[t] = tokenIndex
		// The tokenizer reuses its internal buffer; when the data offsets
		// move backwards, the buffer was reset, so carry the difference
		// over into globalBufDif to keep positions relative to the input.
		if prevEndBuf > p.tokenizer.data.end {
			globalBufDif += prevEndBuf
		}
		prevEndBuf = p.tokenizer.data.end
		// Read and parse the next token.
		p.tokenizer.Next()
		tokenIndex = [2]int{p.tokenizer.data.start + globalBufDif, p.tokenizer.data.end + globalBufDif}
		p.tok = p.tokenizer.Token()
		if p.tok.Type == ErrorToken {
			err = p.tokenizer.Err()
			if err != nil && err != io.EOF {
				return tokenMap, err
			}
		}
		p.parseCurrentToken()
	}
	return tokenMap, nil
}
// ParseFragmentWithIndexes parses a fragment of HTML and returns the nodes
// that were found. If the fragment is the InnerHTML for an existing element,
// pass that element in context.
func ParseFragmentWithIndexes(r io.Reader, context *Node) ([]*Node, map[*Node][2]int, error) {
	contextTag := ""
	if context != nil {
		if context.Type != ElementNode {
			return nil, nil, errors.New("html: ParseFragment of non-element Node")
		}
		// The next check isn't just context.DataAtom.String() == context.Data because
		// it is valid to pass an element whose tag isn't a known atom. For example,
		// DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent.
		if context.DataAtom != a.Lookup([]byte(context.Data)) {
			return nil, nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
		}
		contextTag = context.DataAtom.String()
	}
	p := &parser{
		tokenizer: NewTokenizerFragment(r, contextTag),
		doc: &Node{
			Type: DocumentNode,
		},
		scripting: true,
		fragment:  true,
		context:   context,
	}

	root := &Node{
		Type:     ElementNode,
		DataAtom: a.Html,
		Data:     a.Html.String(),
	}
	p.doc.AppendChild(root)
	p.oe = nodeStack{root}
	p.resetInsertionMode()

	for n := context; n != nil; n = n.Parent {
		if n.Type == ElementNode && n.DataAtom == a.Form {
			p.form = n
			break
		}
	}

	tokenMap, err := parseWithIndexes(p)
	if err != nil {
		return nil, nil, err
	}

	parent := p.doc
	if context != nil {
		parent = root
	}

	var result []*Node
	for c := parent.FirstChild; c != nil; {
		next := c.NextSibling
		parent.RemoveChild(c)
		result = append(result, c)
		c = next
	}
	return result, tokenMap, nil
}
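For completeness, a minimal usage sketch, assuming the fork is imported in place of golang.org/x/net/html (the import path, and whether every text node ends up as a key in the map, are assumptions; the signature and the map[*Node][2]int type are as above):

	// Sketch only: "html" here refers to the forked package above.
	context := html.Node{Type: html.ElementNode, Data: "body", DataAtom: atom.Body}
	s := "<div>Hello, <b>World!</b></div>"
	nodes, tokenMap, err := html.ParseFragmentWithIndexes(strings.NewReader(s), &context)
	if err != nil {
		log.Fatal(err)
	}
	// The "World!" text node, as in the question.
	world := nodes[0].FirstChild.NextSibling.FirstChild
	pos := tokenMap[world] // [2]int{start, end} byte offsets into the input
	log.Println(pos[0], pos[1])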
Not an answer, but too long for a comment. The following could work to some extent: use a Tokenizer directly and step through each token one by one, keeping track of how many input bytes have been consumed. This is a bit painful and not too accurate, but probably the best you can do.
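For what it's worth, here is a minimal sketch of that idea, assuming it is enough to accumulate offsets from the raw bytes of each token (z.Raw returns the raw input bytes of the current token; mapping these spans back onto parsed nodes is the painful part):

	package main

	import (
		"fmt"
		"io"
		"log"
		"strings"

		"golang.org/x/net/html"
	)

	func main() {
		s := "<div>Hello, <b>World!</b></div>"
		z := html.NewTokenizer(strings.NewReader(s))
		offset := 0 // bytes of input consumed so far
		for {
			tt := z.Next()
			if tt == html.ErrorToken {
				if z.Err() == io.EOF {
					break
				}
				log.Fatal(z.Err())
			}
			raw := z.Raw() // raw bytes of the current token
			if tt == html.TextToken {
				fmt.Printf("text %q at %d:%d\n", raw, offset, offset+len(raw))
			}
			offset += len(raw)
		}
	}

For the example input this reports "Hello, " at 5:12 and "World!" at 15:21.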