I tried to compute the integral concurrently, but my program ended up being slower than computing the integral with a normal for loop. What am I doing wrong?
package main
import (
"fmt"
"math"
"sync"
"time"
)
// Result pairs the running integrand sum with a lock so concurrent
// goroutines can update it safely.
type Result struct {
	result float64
	lock   sync.RWMutex // only Lock/Unlock are used here — assumes write-mostly access
}

// Shared state: wg tracks the outstanding sample goroutines, result
// accumulates the midpoint-rule samples.
var wg sync.WaitGroup
var result Result
// main approximates the integral of sqrt(x) over [0, 1] with the
// midpoint rule (n sub-intervals), spawning one goroutine per sample
// point, and prints the integral estimate and the elapsed time.
//
// NOTE(review): one goroutine per sample is the performance problem
// being asked about — per-goroutine scheduling plus a mutex acquisition
// costs far more than the single math.Sqrt each goroutine performs, so
// this runs slower than a plain serial loop.
func main() {
	now := time.Now()
	a := 0.0
	b := 1.0
	n := 100000.0
	deltax := (b - a) / n
	wg.Add(int(n))
	for i := 0.0; i < n; i++ {
		go f(a, deltax, i)
	}
	wg.Wait()
	// deltax * sum of heights = midpoint-rule estimate of the integral.
	fmt.Println(deltax * result.result)
	fmt.Println(time.Now().Sub(now))
}
// f computes one midpoint-rule sample sqrt(a + deltax*(i+0.5)), adds
// it to the shared accumulator under the lock, and signals completion
// on the package-level WaitGroup.
func f(a float64, deltax float64, i float64) {
	fx := math.Sqrt(a + deltax*(i+0.5))
	// The critical section is far cheaper than the goroutine wrapping it,
	// which is why this concurrent version loses to the serial loop.
	result.lock.Lock()
	result.result += fx
	result.lock.Unlock()
	wg.Done()
}
Unless the work done inside each goroutine costs substantially more than the combined overhead of scheduling the goroutine, switching contexts, and taking a mutex to update the shared value, it is faster to do the work serially.
Take a look at a slightly modified version. All I've done is add a delay of 1 microsecond in the f()
function.
package main
import (
"fmt"
"math"
"sync"
"time"
)
// Result pairs the running integrand sum with a lock; the concurrent
// variant updates it under the lock, the serial variant bypasses it.
type Result struct {
	result float64
	lock   sync.RWMutex
}

// Shared state: wg tracks outstanding goroutines in the concurrent
// run, result accumulates samples for both runs.
var wg sync.WaitGroup
var result Result
// main runs the concurrent and serial benchmarks back-to-back,
// zeroing the shared accumulator before each pass so the two
// integral results are comparable.
func main() {
	passes := []struct {
		label string
		run   func()
	}{
		{"concurrent", concurrent},
		{"serial", serial},
	}
	for _, p := range passes {
		result.result = 0 // no-op before the first pass; resets for the second
		fmt.Println(p.label)
		p.run()
	}
}
// concurrent integrates sqrt(x) over [0, 1] with the midpoint rule,
// one goroutine per sample point, then prints the integral estimate
// and the elapsed wall-clock time.
func concurrent() {
	now := time.Now()
	a := 0.0
	b := 1.0
	n := 100000.0
	deltax := (b - a) / n
	wg.Add(int(n))
	for i := 0.0; i < n; i++ {
		go f(a, deltax, i, true)
	}
	wg.Wait()
	fmt.Println(deltax * result.result)
	// time.Since(now) is the idiomatic spelling of time.Now().Sub(now).
	fmt.Println(time.Since(now))
}
// serial integrates sqrt(x) over [0, 1] with the midpoint rule in a
// plain loop (no goroutines, no locking), then prints the integral
// estimate and the elapsed wall-clock time.
func serial() {
	now := time.Now()
	a := 0.0
	b := 1.0
	n := 100000.0
	deltax := (b - a) / n
	for i := 0.0; i < n; i++ {
		f(a, deltax, i, false)
	}
	fmt.Println(deltax * result.result)
	// time.Since(now) is the idiomatic spelling of time.Now().Sub(now).
	fmt.Println(time.Since(now))
}
// f simulates 1µs of work, evaluates one midpoint-rule sample
// sqrt(a + deltax*(i+0.5)), and adds it to the shared accumulator —
// under the lock plus a WaitGroup signal in concurrent mode,
// directly in serial mode.
func f(a, deltax, i float64, concurrent bool) {
	time.Sleep(1 * time.Microsecond) // artificial per-sample workload
	height := math.Sqrt(a + deltax*(i+0.5))
	if !concurrent {
		result.result += height
		return
	}
	result.lock.Lock()
	result.result += height
	result.lock.Unlock()
	wg.Done()
}
With the delay, the result was as follows (the concurrent version is much faster):
concurrent
0.6666666685900424
624.914165ms
serial
0.6666666685900422
5.609195767s
Without the delay:
concurrent
0.6666666685900428
50.771275ms
serial
0.6666666685900422
749.166µs
As you can see, the longer it takes to complete a task, the more sense it makes to do it concurrently, if possible.
3- For a performance gain, you may divide the work across CPU cores so that each goroutine accumulates its own partial sum, removing the need for the sync.RWMutex lock entirely:
Optimized with channels and runtime.NumCPU(), this takes 2ms on 2 cores and 993µs on 8 cores, while your sample code takes 61ms on 2 cores and 40ms on 8 cores:
See this working sample code and outputs:
package main
import (
"fmt"
"math"
"runtime"
"time"
)
// main integrates sqrt(x) over [0, 1] with the midpoint rule,
// splitting the n sample points into roughly one chunk per CPU core
// and summing the per-chunk partial results received over a channel.
func main() {
	nCPU := runtime.NumCPU()
	fmt.Println("nCPU =", nCPU)
	ch := make(chan float64, nCPU)
	startTime := time.Now()
	a := 0.0
	b := 1.0
	n := 100000.0
	deltax := (b - a) / n
	stepPerCPU := n / float64(nCPU)
	// Count the goroutines actually launched: floating-point
	// accumulation of stepPerCPU need not land exactly on n, so the
	// chunk count can differ from nCPU; receiving a fixed nCPU values
	// could then drop a chunk or block forever.
	launched := 0
	for start := 0.0; start < n; {
		stop := start + stepPerCPU
		go f(start, stop, a, deltax, ch)
		start = stop
		launched++
	}
	integral := 0.0
	for i := 0; i < launched; i++ {
		integral += <-ch
	}
	// time.Since is the idiomatic form of time.Now().Sub(startTime).
	fmt.Println(time.Since(startTime))
	fmt.Println(deltax * integral)
}
// f sums the midpoint-rule heights sqrt(a + deltax*(x+0.5)) for
// x in [start, stop), stepping by 1, and sends the partial sum on ch.
func f(start, stop, a, deltax float64, ch chan float64) {
	sum := 0.0
	for x := start; x < stop; x++ {
		sum += math.Sqrt(a + deltax*(x+0.5))
	}
	ch <- sum
}
Output on 2 Cores:
nCPU = 2
2.0001ms
0.6666666685900485
Output on 8 Cores:
nCPU = 8
993µs
0.6666666685900456
Your sample code, Output on 2 Cores:
0.6666666685900424
61.0035ms
Your sample code, Output on 8 Cores:
0.6666666685900415
40.9964ms
2- For good benchmark statistics, use a large number of samples (a big n):
With n := 10000000.0 on the same 2-core CPU, the version using 2 goroutines takes 110ms, while the single-goroutine version takes 215ms:
With n := 10000000.0
and single goroutine, see this working sample code:
package main
import (
"fmt"
"math"
"time"
)
// main integrates sqrt(x) over [0, 1] with the midpoint rule in a
// single goroutine, printing the elapsed time and the estimate.
func main() {
	now := time.Now()
	a := 0.0
	b := 1.0
	n := 10000000.0
	deltax := (b - a) / n
	result := 0.0
	for i := 0.0; i < n; i++ {
		result += math.Sqrt(a + deltax*(i+0.5))
	}
	// time.Since is the idiomatic form of time.Now().Sub(now).
	fmt.Println(time.Since(now))
	fmt.Println(deltax * result)
}
Output:
215.0123ms
0.6666666666685884
With n := 10000000.0
and 2 goroutines, see this working sample code:
package main
import (
"fmt"
"math"
"runtime"
"time"
)
// main integrates sqrt(x) over [0, 1] with the midpoint rule using
// n = 1e7 sample points, one chunk of samples per CPU core, and sums
// the per-chunk partial results received over a channel.
func main() {
	nCPU := runtime.NumCPU()
	fmt.Println("nCPU =", nCPU)
	ch := make(chan float64, nCPU)
	startTime := time.Now()
	a := 0.0
	b := 1.0
	n := 10000000.0
	deltax := (b - a) / n
	stepPerCPU := n / float64(nCPU)
	// Count the goroutines actually launched: floating-point
	// accumulation of stepPerCPU need not land exactly on n, so the
	// chunk count can differ from nCPU; receiving a fixed nCPU values
	// could then drop a chunk or block forever.
	launched := 0
	for start := 0.0; start < n; {
		stop := start + stepPerCPU
		go f(start, stop, a, deltax, ch)
		start = stop
		launched++
	}
	integral := 0.0
	for i := 0; i < launched; i++ {
		integral += <-ch
	}
	// time.Since is the idiomatic form of time.Now().Sub(startTime).
	fmt.Println(time.Since(startTime))
	fmt.Println(deltax * integral)
}
// f accumulates the integrand samples sqrt(a + deltax*(idx+0.5)) for
// indices idx in [start, stop) and delivers the chunk's total on ch.
func f(start, stop, a, deltax float64, ch chan float64) {
	partial := 0.0
	for idx := start; idx < stop; idx++ {
		midpoint := a + deltax*(idx+0.5)
		partial += math.Sqrt(midpoint)
	}
	ch <- partial
}
Output:
nCPU = 2
110.0063ms
0.6666666666686073
1- There is an optimum point for number of Goroutines, And from this point forward increasing number of Goroutines doesn't reduce program execution time:
On 2 Cores CPU, with the following code, The result is:
nCPU: 1, 2, 4, 8, 16
Time: 2.1601236s, 1.1220642s, 1.1060633s, 1.1140637s, 1.1380651s
As you can see, the drop in execution time from nCPU=1 to nCPU=2 is large, but beyond that point the gains are marginal. So nCPU=2 is the optimum for this sample code on a 2-core CPU, and using nCPU := runtime.NumCPU() is sufficient here.
package main
import (
"fmt"
"math"
"time"
)
// main integrates sqrt(x) over [0, 1] with the midpoint rule using
// n = 1e8 sample points split into nCPU chunks, summing the per-chunk
// partial results received over a channel. nCPU is hard-coded here to
// compare different goroutine counts on the same machine.
func main() {
	nCPU := 2 //2.1601236s@1 1.1220642s@2 1.1060633s@4 1.1140637s@8 1.1380651s@16
	fmt.Println("nCPU =", nCPU)
	ch := make(chan float64, nCPU)
	startTime := time.Now()
	a := 0.0
	b := 1.0
	n := 100000000.0
	deltax := (b - a) / n
	stepPerCPU := n / float64(nCPU)
	// Count the goroutines actually launched: floating-point
	// accumulation of stepPerCPU need not land exactly on n, so the
	// chunk count can differ from nCPU; receiving a fixed nCPU values
	// could then drop a chunk or block forever.
	launched := 0
	for start := 0.0; start < n; {
		stop := start + stepPerCPU
		go f(start, stop, a, deltax, ch)
		start = stop
		launched++
	}
	integral := 0.0
	for i := 0; i < launched; i++ {
		integral += <-ch
	}
	// time.Since is the idiomatic form of time.Now().Sub(startTime).
	fmt.Println(time.Since(startTime))
	fmt.Println(deltax * integral)
}
// f evaluates sqrt at the midpoint of every unit sub-interval in
// [start, stop) and reports the chunk's partial sum over ch.
func f(start, stop, a, deltax float64, ch chan float64) {
	var acc float64
	for k := start; k < stop; k++ {
		acc += math.Sqrt(a + deltax*(k+0.5))
	}
	ch <- acc
}