在Go中解析HTML输入标签

The Go HTML parsing package ignores input tags and interprets them as text content of the form tag. What is the best option to bypass this limitation?

package main

import (
    "fmt"
    "strings"

    "golang.org/x/net/html"
)

// HTML is the sample document fed to the tokenizer in main: a minimal page
// whose body holds a GET form containing a single self-closing submit input.
const HTML = ` 
<!DOCTYPE html>
<html lang="en">
     <head>
        <meta charset="utf-8"/>
        <title>selected attribute</title>
    </head>
    <body>
        <form method="GET">
            <input type="submit" value="submit"/>
        </form>
    </body>
</html>
`

// main tokenizes the sample document and prints the name of every token the
// tokenizer reports as html.StartTagToken. Tokens of any other type,
// including html.SelfClosingTagToken, are skipped without output.
func main() {
    tokenizer := html.NewTokenizer(strings.NewReader(HTML))
    for {
        kind := tokenizer.Next()
        // The loop stops at the first ErrorToken.
        if kind == html.ErrorToken {
            return
        }
        if kind != html.StartTagToken {
            continue
        }
        // Only the tag name is needed; the second result is discarded.
        tagName, _ := tokenizer.TagName()
        fmt.Println(string(tagName))
    }
}

> ignores input tags and interprets them as text content of the form tag

Your premise is wrong; the package does not ignore input tags, as the following program demonstrates:

package main

import (
    "fmt"
    "golang.org/x/net/html"
    "strings"
)

// HTML is the sample document parsed by main: a minimal page whose body
// holds a GET form containing a single submit input with value "submit".
const HTML = ` 
<!DOCTYPE html>
<html lang="en">
     <head>
        <meta charset="utf-8"/>
        <title>selected attribute</title>
    </head>
    <body>
        <form method="GET">
            <input type="submit" value="submit"/>
        </form>
    </body>
</html>
`

// main parses the sample document into a node tree, walks it recursively and
// prints the "value" attribute of every <input> element it encounters.
func main() {
    doc, err := html.Parse(strings.NewReader(HTML))
    if err != nil {
        // Stop here rather than walk a tree that may be nil.
        panic(err)
    }

    // walk inspects n and then recurses into each of its children in order.
    var walk func(*html.Node)
    walk = func(n *html.Node) {
        if n.Type == html.ElementNode && n.Data == "input" {
            for _, attr := range n.Attr {
                if attr.Key == "value" {
                    fmt.Println(attr.Val)
                    break // report only the first "value" attribute
                }
            }
        }
        for child := n.FirstChild; child != nil; child = child.NextSibling {
            walk(child)
        }
    }
    walk(doc)
}

This will print out "submit", the value of the input tag.

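When using the tokenizer, note that a tag written with a trailing slash, such as `<input type="submit" value="submit"/>`, is reported as html.SelfClosingTagToken rather than html.StartTagToken. To find input tags, check for html.SelfClosingTagToken in addition to html.StartTagToken.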