I need to download some big HTML files (about 300k+). Everything is working fine; the problem is that some of the files come with Windows-1252/ISO-8859 characters like "á" or "ç", and when I save the document, Go converts them to characters like �. Any idea?
My code is as follows:
package main
import (
	"fmt"
	"io"
	"net/http"
	"os"
)
// main downloads a single hard-coded page into doc.html and panics if the
// download reports an error.
func main() {
	pagina := "http://www.mypage.com/doc?someparameters=123"
	if err := DownloadFile("doc.html", pagina); err != nil {
		panic(err)
	}
}
// DownloadFile fetches url with an HTTP GET and streams the response body
// into the file at filepath, creating the file or truncating an existing one.
//
// The request is made before the file is created, so a failed request or a
// non-2xx response returns an error without leaving an empty file behind.
// The body is copied byte-for-byte: no character-set conversion is applied,
// so the saved file keeps whatever encoding the server sent.
//
// It returns the first error encountered while requesting, creating,
// writing, or closing the file.
func DownloadFile(filepath string, url string) (err error) {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// Refuse to save an error page as if it were the requested document.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return fmt.Errorf("downloading %s: unexpected status %s", url, resp.Status)
	}

	out, err := os.Create(filepath)
	if err != nil {
		return err
	}
	// Close can report a write error; surface it unless an earlier error
	// is already being returned.
	defer func() {
		if cerr := out.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	_, err = io.Copy(out, resp.Body)
	return err
}
Something like:
import "golang.org/x/text/encoding/charmap"
// Option A: wrap the response body in a Windows-1252 decoder and copy the
// decoded stream into out.
rdrBody := charmap.Windows1252.NewDecoder().Reader(resp.Body)
_, err = io.Copy(out, rdrBody)
// Option B: the same, using the ISO-8859-1 decoder. Use one option or the
// other, not both: each declares rdrBody with := and reads resp.Body.
rdrBody := charmap.ISO8859_1.NewDecoder().Reader(resp.Body)
_, err = io.Copy(out, rdrBody)
There are many other ISO8859 encodings in package charmap.
A simple example of using charmap.ISO8859_1:
package main
import (
"fmt"
"io"
"io/ioutil"
"net/http"
"strings"
"golang.org/x/text/encoding/charmap"
)
// main fetches http://example.com and prints at most the first 256 bytes of
// the body. When the Content-Type header names charset=iso-8859-1, the body
// is read through the charmap.ISO8859_1 decoder first. Any error is printed
// and ends the program.
func main() {
	resp, err := http.Get("http://example.com")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()

	// Start from the raw body; swap in a decoding reader only when the
	// server declares ISO-8859-1.
	var src io.Reader = resp.Body
	if ct := strings.ToLower(resp.Header.Get("Content-Type")); strings.Contains(ct, "charset=iso-8859-1") {
		src = charmap.ISO8859_1.NewDecoder().Reader(src)
	}

	data, err := ioutil.ReadAll(src)
	if err != nil {
		fmt.Println(err)
		return
	}

	// Cap the preview at 256 bytes without slicing past the end.
	limit := len(data)
	if limit > 256 {
		limit = 256
	}
	fmt.Println(string(data[:limit]))
}