I need to compute the hash (md5 is ok) for a large number of files. So, in Go I have this code:
package main
import (
"io"
"os"
"fmt"
"path/filepath"
"crypto/md5"
"encoding/hex"
)
func strSliceRemove(slice []string, str string) []string {
var tempSlice []string;
for _, item := range slice {
if item != str {
tempSlice = append(tempSlice, item)
}
}
return tempSlice
}
func fileMD5(path string) (string, error) {
var returnMD5String string
file, err := os.Open(path)
if err != nil {
return returnMD5String, err
}
defer file.Close()
hash := md5.New()
if _, err := io.Copy(hash, file); err != nil {
return returnMD5String, err
}
hashInBytes := hash.Sum(nil)[:16]
returnMD5String = hex.EncodeToString(hashInBytes)
return returnMD5String, nil
}
func main() {
var doRead func(string)
doRead = func(sd string) {
filepath.Walk(sd, func(path string, f os.FileInfo, err error) error {
resolvedPath, resolvedPathErr := filepath.EvalSymlinks(path)
if resolvedPathErr != nil {
return nil
}
if f.Mode()&os.ModeSymlink == os.ModeSymlink {
doRead(resolvedPath)
} else {
if !f.IsDir() {
md5, _ := fileMD5(path)
fmt.Printf("%s
", md5)
}
}
return nil
})
}
doRead("/tmp/electron")
return
}
It hashes correctly 1400 files in almost one second. If I use my OSX md5 command line utility, it takes more than 10 times the time. It is 10 times slower:
for FILE in `find /tmp/electron`; do
if [ ! -d "$FILE" ]; then
md5 $FILE;
fi;
done;
I tried a basic c program that does the same (based on this answer How to calculate the MD5 hash of a large file in C?) and still the time seems more or less 10 seconds.
What kind of strategy / library does crypto/md5 use?