为什么goroutines这么慢?

I posted a question with nearly the same code yesterday, asking how to fan in concurrently over the channels passed to a variadic function. After it was resolved, I expected the program to take nearly the same amount of time with 30+ generators as with one. That doesn't seem to be the case.

The times I see are about 5ms with one generator and about 150ms with the code below. (For some reason, play.golang.org shows 0.)

Why is it slower? My expectation was that, with the multiple goroutines, it would take about as long. Something to do with spinning up the goroutines?

package main

import (
    "fmt"
    "sync"
    "time"
)

func main() {
    t := time.Now()
    _ = fanIn(
        generator(4, 5, 6, 7),
        generator(1, 2, 6, 3, 7),
        generator(12, 15, 33, 40, 10),
        generator(18, 13, 20, 40, 15),
        generator(100, 200, 64000, 3121, 1237),
        generator(4, 5, 6, 7),
        generator(1, 2, 6, 3, 7),
        generator(12, 15, 33, 40, 10),
        generator(18, 13, 20, 40, 15),
        generator(100, 200, 64000, 3121, 1237),
        generator(4, 5, 6, 7),
        generator(1, 2, 6, 3, 7),
        generator(12, 15, 33, 40, 10),
        generator(18, 13, 20, 40, 15),
        generator(100, 200, 64000, 3121, 1237),
        generator(4, 5, 6, 7),
        generator(1, 2, 6, 3, 7),
        generator(12, 15, 33, 40, 10),
        generator(18, 13, 20, 40, 15),
        generator(100, 200, 64000, 3121, 1237),
        generator(4, 5, 6, 7),
        generator(1, 2, 6, 3, 7),
        generator(12, 15, 33, 40, 10),
        generator(18, 13, 20, 40, 15),
        generator(100, 200, 64000, 3121, 1237),
        generator(4, 5, 6, 7),
        generator(1, 2, 6, 3, 7),
        generator(12, 15, 33, 40, 10),
        generator(18, 13, 20, 40, 15),
        generator(100, 200, 64000, 3121, 1237),
        generator(4, 5, 6, 7),
        generator(1, 2, 6, 3, 7),
        generator(12, 15, 33, 40, 10),
        generator(18, 13, 20, 40, 15),
        generator(100, 200, 64000, 3121, 1237),
    )

    fmt.Println(time.Now().Sub(t))
}

// generator returns a receive-only channel that yields nums in order and is
// closed after the last value. The channel buffers up to 10 values; the sends
// happen on a background goroutine, so generator itself returns immediately.
func generator(nums ...int) <-chan int {
    ch := make(chan int, 10)
    emit := func() {
        for i := range nums {
            ch <- nums[i]
        }
        // All values have been sent; let receivers' range loops finish.
        close(ch)
    }
    go emit()
    return ch
}

// fanIn merges every channel in in onto a single returned channel.
//
// One forwarding goroutine is started per input; each copies values from its
// input to the output until that input is closed. The output (buffered, 10
// values) is closed once every input has been drained, so callers can range
// over it. Values from different inputs arrive in no particular order. With no
// inputs, the returned channel is closed without delivering anything.
func fanIn(in ...<-chan int) <-chan int {
    out := make(chan int, 10)

    var wg sync.WaitGroup
    // Register all forwarders before any goroutine starts, so Wait below can
    // never observe a zero counter while forwarders are still pending.
    wg.Add(len(in))

    // Start the forwarders directly. Launching them from yet another goroutine
    // adds a goroutine without adding any concurrency.
    for _, src := range in {
        go func(ch <-chan int) {
            defer wg.Done()
            for val := range ch {
                out <- val
            }
        }(src)
    }

    // Only this goroutine closes out, and only after every forwarder has
    // finished sending.
    go func() {
        wg.Wait()
        close(out)
    }()
    return out
}

There is a small difference between go run and go build (compile time).
For me it is 17ms (on 2 cores) and 3ms (on 8 cores) with go1.7 amd64:

difference between go run and go build:
951.0543ms-934.0535ms = 17.0008ms (on 2 Cores)
575.3447ms-572.3914ms = 2.9533ms (on 8 Cores)

difference between 8 Cores and 2 Cores with go build:
934.0535ms-572.3914ms = 361.6621ms

For good benchmark statistics, use a large number of samples.
Also try updating to the latest Go version (1.7).

Try this working sample code, and compare your result with these outputs:

package main

import (
    "fmt"
    "math/rand"
    "sync"
    "time"
)

// main starts 1000 generators, each emitting a random permutation of the
// integers 0..9999, merges them with fanIn, and prints how long that setup
// took. It then drains the merged channel into a slice and prints how many
// values were received.
func main() {
    const (
        numGenerators = 1000
        valuesPerGen  = 10000
    )

    start := time.Now()
    cs := make([]<-chan int, numGenerators)
    for i := range cs {
        cs[i] = generator(rand.Perm(valuesPerGen)...)
    }
    ch := fanIn(cs...)
    // Printed before the merged channel is drained: this duration covers
    // building the inputs and calling fanIn, not receiving the values.
    fmt.Println(time.Since(start))

    // The total number of values is known up front, so reserve it once.
    // len(ch) would only report how many values happen to be buffered in the
    // channel at this instant, which is not a useful capacity.
    is := make([]int, 0, numGenerators*valuesPerGen)
    for v := range ch {
        is = append(is, v)
    }
    fmt.Println("len=", len(is))
}

// generator returns a receive-only channel that yields nums in order and is
// closed after the last value. The channel's buffer is sized to hold every
// value, so the background goroutine doing the sends never has to wait for a
// receiver.
func generator(nums ...int) <-chan int {
    total := len(nums)
    ch := make(chan int, total)
    go func() {
        for i := 0; i < total; i++ {
            ch <- nums[i]
        }
        // All values have been sent; let receivers' range loops finish.
        close(ch)
    }()
    return ch
}

// fanIn multiplexes all of the given input channels onto one output channel.
//
// Each input gets its own goroutine that relays values to the output until
// the input is closed. A sync.WaitGroup tracks those goroutines; when the last
// one finishes, the output (buffered, 10 values) is closed so that a range
// over it terminates. No ordering is guaranteed between values from different
// inputs. Calling fanIn with no inputs returns a channel that is closed
// without delivering anything.
func fanIn(in ...<-chan int) <-chan int {
    var wg sync.WaitGroup
    out := make(chan int, 10)

    // Add the full count up front: the closer goroutine below calls Wait, and
    // the counter must already be non-zero by then.
    wg.Add(len(in))

    // Relays are started straight from this loop; an extra goroutine whose
    // only job is to run the loop would add nothing.
    relay := func(ch <-chan int) {
        defer wg.Done()
        for val := range ch {
            out <- val
        }
    }
    for _, ch := range in {
        go relay(ch)
    }

    // Close out exactly once, after every relay has stopped sending.
    go func() {
        wg.Wait()
        close(out)
    }()
    return out
}

output with 2 Cores ( with go run):

951.0543ms
len= 10000000

output with 2 Cores ( with go build):

934.0535ms
len= 10000000

output with 8 Cores ( with go run):

575.3447ms
len= 10000000

output with 8 Cores ( with go build):

572.3914ms
len= 10000000