在Golang中提取*html.Node的位置偏移

How can I extract the positional offset of a specific node in an already parsed HTML document? For example, for the document <div>Hello, <b>World!</b></div> I want to be able to determine that the offset of World! is 15:21. The document may be modified during parsing.

I have a solution that renders the whole document with special marker strings, but it is really bad for performance. Any ideas?

package main

import (
    "bytes"
    "golang.org/x/net/html"
    "golang.org/x/net/html/atom"
    "log"
    "strings"
)

// nodeIndexOffset reports the byte offsets at which node's data begins and
// ends inside the HTML produced by rendering every child of context.
//
// If node is not a text node, its first child is measured instead. The
// offsets are found by temporarily wrapping the node's Data in marker strings,
// rendering the children of context once, and locating the markers in the
// output. The node's Data is restored before the function returns.
//
// It returns (-1, -1) when context or node is nil, when a non-text node has no
// children, when rendering fails, or when the node is not reachable from
// context (so the markers never appear in the output).
func nodeIndexOffset(context *html.Node, node *html.Node) (int, int) {
    const (
        startMark = "|start|"
        endMark   = "|end|"
    )

    if context == nil || node == nil {
        return -1, -1
    }
    if node.Type != html.TextNode {
        node = node.FirstChild
        if node == nil {
            // An element without children has nothing to measure.
            return -1, -1
        }
    }

    originalData := node.Data
    node.Data = startMark + originalData + endMark
    // Restore the tree on every exit path, including early returns.
    defer func() { node.Data = originalData }()

    // Render all top-level children so that nodes located under later
    // siblings of context.FirstChild are found as well.
    var buf bytes.Buffer
    for c := context.FirstChild; c != nil; c = c.NextSibling {
        if err := html.Render(&buf, c); err != nil {
            // A partial render would yield misleading offsets.
            return -1, -1
        }
    }
    rendered := buf.String()

    start := strings.Index(rendered, startMark)
    if start < 0 {
        return -1, -1
    }
    // Search for the end marker only after the start marker, so a literal
    // "|end|" earlier in the document cannot be mistaken for it. The distance
    // between the two markers is the rendered length of the node's data.
    length := strings.Index(rendered[start+len(startMark):], endMark)
    if length < 0 {
        return -1, -1
    }
    return start, start + length
}

// main parses a small HTML fragment in the context of a <body> element,
// attaches the parsed nodes to that context, and logs the "World!" text node
// together with the offsets reported by nodeIndexOffset.
func main() {
    const fragment = "<div>Hello, <b>World!</b></div>"

    context := &html.Node{
        Type:     html.ElementNode,
        Data:     "body",
        DataAtom: atom.Body,
    }

    nodes, err := html.ParseFragment(strings.NewReader(fragment), context)
    if err != nil {
        log.Fatal(err)
    }
    for i := range nodes {
        context.AppendChild(nodes[i])
    }

    // Navigate <div> -> "Hello, " -> <b> -> "World!".
    bold := nodes[0].FirstChild.NextSibling
    world := bold.FirstChild
    log.Println("target", world)
    log.Println(nodeIndexOffset(context, world))
}

I came up with a solution in which we extend (please correct me if there is another way to do it) the original HTML package with an additional custom.go file containing a new exported function. This function is able to access the unexported data property of the Tokenizer, which holds exactly the start and end positions of the current Node. We have to adjust the positions after each buffer read; see globalBufDif.

I don't really like having to fork the package just to access a couple of properties, but it seems like this is the Go way.

// parseWithIndexes drives the parser until the tokenizer reports io.EOF and
// returns a map from nodes to a [start, end] pair of tokenizer offsets.
//
// At the top of every iteration the last child of the parser's current top
// node is associated with the offsets of the token read in the previous
// iteration. On the first iteration that pair is still the zero value, and
// the key is nil whenever the top node has no children.
//
// Any tokenizer error other than io.EOF stops parsing and is returned together
// with the map built so far.
func parseWithIndexes(p *parser) (map[*Node][2]int, error) {
    var (
        err          error
        globalBufDif int    // accumulated shift added to the tokenizer's offsets
        lastEnd      int    // tokenizer data.end as seen before the previous Next call
        span         [2]int // offsets of the most recently read token
    )
    spans := make(map[*Node][2]int)

    // Iterate until EOF. Any other error will cause an early return.
    for err != io.EOF {
        // CDATA sections are allowed only in foreign content.
        current := p.oe.top()
        p.tokenizer.AllowCDATA(current != nil && current.Namespace != "")

        // Walk the sibling chain to the last child of the top node.
        last := p.top().FirstChild
        for last != nil && last.NextSibling != nil {
            last = last.NextSibling
        }
        spans[last] = span

        // When data.end moved backwards since the previous iteration, fold the
        // previous end into the running shift.
        // NOTE(review): presumably this compensates for the tokenizer reusing
        // its buffer — confirm against the Tokenizer implementation.
        if lastEnd > p.tokenizer.data.end {
            globalBufDif += lastEnd
        }
        lastEnd = p.tokenizer.data.end

        // Read and parse the next token.
        p.tokenizer.Next()
        span = [2]int{
            p.tokenizer.data.start + globalBufDif,
            p.tokenizer.data.end + globalBufDif,
        }

        p.tok = p.tokenizer.Token()
        if p.tok.Type == ErrorToken {
            err = p.tokenizer.Err()
            if err != nil && err != io.EOF {
                return spans, err
            }
        }
        p.parseCurrentToken()
    }
    return spans, nil
}

// ParseFragmentWithIndexes parses a fragment of HTML read from r and returns
// the top-level nodes that were found, along with the node-to-offset map
// produced by parseWithIndexes. If the fragment is the InnerHTML for an
// existing element, pass that element in context; otherwise pass nil.
//
// An error is returned when context is not an element node, when its DataAtom
// and Data disagree, or when parsing fails; in those cases both the node slice
// and the map are nil.
func ParseFragmentWithIndexes(r io.Reader, context *Node) ([]*Node, map[*Node][2]int, error) {
    var contextTag string
    if context != nil {
        switch {
        case context.Type != ElementNode:
            return nil, nil, errors.New("html: ParseFragment of non-element Node")
        // The check below isn't just context.DataAtom.String() == context.Data
        // because it is valid to pass an element whose tag isn't a known atom.
        // For example, DataAtom == 0 and Data = "tagfromthefuture" is
        // perfectly consistent.
        case context.DataAtom != a.Lookup([]byte(context.Data)):
            return nil, nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
        }
        contextTag = context.DataAtom.String()
    }

    p := &parser{
        tokenizer: NewTokenizerFragment(r, contextTag),
        doc:       &Node{Type: DocumentNode},
        scripting: true,
        fragment:  true,
        context:   context,
    }

    // Seed the document with a synthetic <html> root and make it the only
    // entry on the stack of open elements.
    root := &Node{
        Type:     ElementNode,
        DataAtom: a.Html,
        Data:     a.Html.String(),
    }
    p.doc.AppendChild(root)
    p.oe = nodeStack{root}
    p.resetInsertionMode()

    // Use the nearest <form> element among context and its ancestors, if any,
    // as the parser's form pointer.
    for anc := context; anc != nil; anc = anc.Parent {
        if anc.Type != ElementNode || anc.DataAtom != a.Form {
            continue
        }
        p.form = anc
        break
    }

    tokenMap, err := parseWithIndexes(p)
    if err != nil {
        return nil, nil, err
    }

    // With a context element the results hang off the synthetic root;
    // without one they hang off the document node.
    parent := root
    if context == nil {
        parent = p.doc
    }

    // Detach every child from parent and collect it in document order. The
    // next sibling is captured before RemoveChild is called on the child.
    var result []*Node
    child := parent.FirstChild
    for child != nil {
        following := child.NextSibling
        parent.RemoveChild(child)
        result = append(result, child)
        child = following
    }
    return result, tokenMap, nil
}

Not an answer, but too long for a comment. The following could work to some extent:

  • Use a Tokenizer and step through each element one by one.
  • Wrap your input in a custom reader which records line and column offsets as the Tokenizer reads from it.
  • Query your custom reader for the position before and after calling Next() to record the approximate position information you need.

This is a bit painful and not very accurate, but it is probably the best you can do.