Is there a well-known client library for the Go programming language that supports Hadoop Streaming? I have searched around and was unable to find anything of value.
You can run your Hadoop Streaming jobs directly in Go; I've heard of people doing it, and here is a WordCount example in Go taken from a blog post. First, the mapper:
package main

import (
    "bufio"
    "fmt"
    "io"
    "os"
    "regexp"
)

func main() {
    /* Word regular expression. */
    re := regexp.MustCompile("[a-zA-Z0-9]+")
    reader := bufio.NewReader(os.Stdin)
    for {
        line, _, err := reader.ReadLine()
        if err != nil {
            if err != io.EOF {
                fmt.Fprintf(os.Stderr, "error: can't read - %s\n", err)
            }
            break
        }
        /* Emit "<word>\t1" for every word on the line. */
        matches := re.FindAll(line, -1)
        for _, word := range matches {
            fmt.Printf("%s\t1\n", word)
        }
    }
}
And here is the reducer:
package main

import (
    "bufio"
    "bytes"
    "fmt"
    "io"
    "os"
    "strconv"
)

func main() {
    counts := make(map[string]uint64)
    reader := bufio.NewReader(os.Stdin)
    for {
        line, _, err := reader.ReadLine()
        if err != nil {
            if err != io.EOF {
                fmt.Fprintf(os.Stderr, "error: can't read - %s\n", err)
            }
            break
        }
        /* Each input line is "<word>\t<count>". */
        i := bytes.IndexByte(line, '\t')
        if i == -1 {
            fmt.Fprintln(os.Stderr, "error: can't find tab")
            continue
        }
        word := string(line[0:i])
        count, err := strconv.ParseUint(string(line[i+1:]), 10, 64)
        if err != nil {
            fmt.Fprintf(os.Stderr, "error: bad number - %s\n", err)
            continue
        }
        counts[word] += count
    }
    /* Output aggregated counts. */
    for word, count := range counts {
        fmt.Printf("%s\t%d\n", word, count)
    }
}
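Once both programs are built, you submit them with the standard Hadoop Streaming jar. A minimal sketch of the commands follows; the binary names, the jar location, and the HDFS paths are placeholders that depend on your installation, and the binaries must be built for the OS/architecture of the cluster nodes (e.g. GOOS=linux):
# Build the two binaries (file names here are assumptions).
go build -o wc_mapper mapper.go
go build -o wc_reducer reducer.go

# Quick local test: sort stands in for the shuffle phase.
cat input.txt | ./wc_mapper | sort | ./wc_reducer

# Submit as a streaming job; adjust the jar path and HDFS paths
# to match your cluster.
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
    -input /user/me/input \
    -output /user/me/output \
    -mapper wc_mapper \
    -reducer wc_reducer \
    -file wc_mapper \
    -file wc_reducer
The -file options ship the binaries to the cluster so each task node can execute them.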
Alternatively, you could use dmrgo to make writing your streaming jobs easier. It has a WordCount example available here.
I also saw another library called gomrjob, but it doesn't look very well maintained and is still very alpha; you could give it a try if you're feeling adventurous :)