Go XML - 解析HTML中的布尔属性会导致XML验证错误

I have an html output with the following tag.

<hr noshade>

My struct for this is

// Hr is the asker's mapping for the <hr> element.
// NOTE(review): the element name is attached to a field called TagName; the
// working answer further down uses a field named XMLName for this instead.
type Hr struct {
    TagName xml.Name `xml:"hr"`
}

When I try to parse the HTML using "encoding/xml", it returns an error saying the attribute doesn't have a '=' character.

I've seen that this error is thrown because the default Decoder evaluates XML with Strict set to true.

How can I ignore this and continue parsing the document (using xml.Unmarshal())?

EDIT: Including the XML and the structs used.

I found the Decoder settings and switched to xml.NewDecoder; however, the unmarshalling still doesn't seem to happen properly.

<html><head><title>Some title</title></head>
<body>
 <h2>Title here</h2>
 <ul>
  <li><a href="../">..</a></li>
  <li><a href="file1.txt">file1.txt</a></li>
  <li><a href="file2.zip">file2.zip</a></li>
  .....
 </ul>
 <hr noshade><em>Powered by <a href="http://subversion.apache.org/">Apache Subversion</a> version 1.7.18 (r1615261).</em>
</body></html>

Code I've written so far

// Anchor is the asker's mapping for an <a> element.
type Anchor struct {
    TagName xml.Name `xml:"a"`
    // Href is tagged to be read from the href attribute.
    Href    string   `xml:"href,attr"`
}

// ListEntry is the asker's mapping for one <li> item of the listing.
type ListEntry struct {
    TagName  xml.Name `xml:"li"`
    // Filename is intended to hold the link inside the item; it has no xml tag.
    Filename Anchor
}

// DirList is the asker's mapping for the <ul> element.
type DirList struct {
    XMLName xml.Name `xml:"ul"`
    // Entries has no xml tag here; the corrected version further down tags
    // the same field with `xml:"li"`.
    Entries []ListEntry
}

// Header is the asker's mapping for the <h2> element. It declares no field
// for the heading text.
type Header struct {
    TagName xml.Name `xml:"h2"`
}

// Head is the asker's mapping for the <head> element.
type Head struct {
    TagName xml.Name `xml:"head"`
    // NOTE(review): title is lower-case (unexported); as the answer below points
    // out, another package such as encoding/xml cannot access such a field.
    title   Title
}

// Title is the asker's mapping for the <title> element. It declares no field
// for the title text.
type Title struct {
    TagName xml.Name `xml:"title"`
}

// html is the asker's mapping for the document root <html>.
// NOTE(review): body and head are lower-case (unexported); as the answer below
// points out, another package such as encoding/xml cannot access such fields.
type html struct {
    TagName xml.Name `xml:"html"`
    body    Body     `xml:"body"`
    head    Head
}

// Body is the asker's mapping for the <body> element. None of its fields
// carries an xml tag.
// NOTE(review): hr and em are lower-case (unexported); as the answer below
// points out, another package such as encoding/xml cannot access such fields.
type Body struct {
    H2            Header
    DirectoryList DirList
    hr            Hr
    em            Em
}

// Hr is the asker's mapping for the <hr> element (the same declaration that is
// shown at the top of the question).
type Hr struct {
    TagName xml.Name `xml:"hr"`
}

// Em is the asker's mapping for the <em> footer element.
type Em struct {
    TagName xml.Name `xml:"em"`
    // NOTE(review): link is lower-case (unexported); as the answer below points
    // out, another package such as encoding/xml cannot access such a field.
    link    Anchor
}

   // Fetch the HTML listing; retrieveFromWeb is not shown in this snippet.
   contents := retrieveFromWeb()

    htmlTag := html{}
    decoder := xml.NewDecoder(strings.NewReader(contents))
    // Relax the decoder so that HTML such as <hr noshade> (an attribute without
    // '=') no longer aborts parsing.
    decoder.Strict = false
    decoder.AutoClose = xml.HTMLAutoClose
    decoder.Entity = xml.HTMLEntity

    // NOTE(review): err is assigned but not checked before the result is printed.
    err = decoder.Decode(&htmlTag)

    fmt.Println("DirList: ", htmlTag)

Current output

DirList:  {{ } {{{ }} {{ } []} {{ }} {{ } {{ } }}} {{ } {{ }}}}

You can use an xml.Decoder instead of xml.Unmarshal. A Decoder lets you turn off strict parsing (Strict = false), which avoids the error you are facing. Since you posted only one line of XML/HTML, I have assumed a root element and some text inside the hr tag. Below is a sample implementation:

package main

import (
    "encoding/xml"
    "fmt"
    "strings"
)

// Hr maps the sample document's root element <a>; despite its name it does not
// represent the <hr> element itself.
type Hr struct {
    XMLName xml.Name `xml:"a"`
    // TagName receives the text inside the <hr> child element ("value" for the
    // sample input used in main).
    TagName string   `xml:"hr"`
}

// main decodes a small sample document with strict parsing switched off and
// prints the text found inside its <hr> element.
func main() {
    const sample = "<a><hr noshade>value</hr></a>"

    dec := xml.NewDecoder(strings.NewReader(sample))
    // Non-strict mode is what lets the value-less noshade attribute through.
    dec.Strict = false

    var hr Hr
    if err := dec.Decode(&hr); err != nil {
        panic(err)
    }

    fmt.Println(hr.TagName)
}

fmt.Println(hr.TagName) will print "value"

There are many errors in your code:

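  • If a struct field is not exported, it cannot be accessed by another package (encoding/xml in this case): make every field name start with an upper-case letter.
  • The element name of a struct must be declared on a field named XMLName (of type xml.Name); a field called TagName has no special meaning to encoding/xml.
  • The Entries slice was missing its element name: tag it with `xml:"li"`.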

See this working code

http://play.golang.org/p/rkNf2OfvdM

package main

import (
    "encoding/xml"
    "fmt"
    "log"
    "strings"
)

// Anchor maps an <a> element.
type Anchor struct {
    XMLName xml.Name `xml:"a"`
    // Href is taken from the element's href attribute.
    Href    string   `xml:"href,attr"`
}

// ListEntry maps a single <li> item of the directory listing.
type ListEntry struct {
    XMLName  xml.Name `xml:"li"`
    Filename Anchor   // the link contained in the item
}

// DirList maps the <ul> element holding the directory entries.
type DirList struct {
    XMLName xml.Name    `xml:"ul"`
    // Entries collects the <li> children of the list.
    Entries []ListEntry `xml:"li"`
}

// Header maps the <h2> element. It declares no field for the heading text.
type Header struct {
    XMLName xml.Name `xml:"h2"`
}

// Head maps the <head> element.
type Head struct {
    XMLName xml.Name `xml:"head"`
    // Title has no xml tag of its own.
    // NOTE(review): presumably matched via the `xml:"title"` tag on
    // Title.XMLName — confirm against the encoding/xml documentation.
    Title   Title
}

// Title maps the <title> element. It declares no field for the title text.
type Title struct {
    XMLName xml.Name `xml:"title"`
}

// Html maps the document root <html>. Unlike the version in the question, the
// type's fields are exported.
type Html struct {
    XMLName xml.Name `xml:"html"`
    // Body is mapped explicitly to the <body> child element.
    Body    Body     `xml:"body"`
    // Head has no xml tag of its own.
    Head    Head
}

// Body maps the <body> element. All fields are exported and none carries an
// xml tag.
// NOTE(review): presumably each field is matched through the XMLName tag of
// its struct type (h2, ul, hr, em) — confirm against the encoding/xml docs.
type Body struct {
    H2            Header
    DirectoryList DirList
    Hr            Hr
    Em            Em
}

// Hr maps the <hr> element; it records nothing beyond the element name.
type Hr struct {
    XMLName xml.Name `xml:"hr"`
}

// Em maps the <em> footer element of the listing page.
type Em struct {
    XMLName xml.Name `xml:"em"`
    // Link is the anchor nested inside <em>. It has to be exported:
    // encoding/xml works through reflection and can only assign to exported
    // fields, so a lower-case "link" field would always stay empty.
    Link    Anchor
}

// contents is the sample directory-listing page decoded by main. Note the
// value-less noshade attribute on <hr>.
var contents = `<html><head><title>Some title</title></head>
<body>
 <h2>Title here</h2>
 <ul>
  <li><a href="../">..</a></li>
  <li><a href="file1.txt">file1.txt</a></li>
  <li><a href="file2.zip">file2.zip</a></li>
 </ul>
 <hr noshade><em>Powered by <a href="http://subversion.apache.org/">Apache Subversion</a> version 1.7.18 (r1615261).</em>
</body></html>`

func main() {
    htmlTag := Html{}
    decoder := xml.NewDecoder(strings.NewReader(contents))
    decoder.Strict = false
    decoder.AutoClose = xml.HTMLAutoClose
    decoder.Entity = xml.HTMLEntity

    err := decoder.Decode(&htmlTag)
    if err != nil {
        log.Fatal(err)
    }

    fmt.Printf("DirList: %v %#[1]v
", htmlTag)
}