I tried to compute the integral concurrently, but my program ended up being slower than computing the integral with a normal for loop. What am I doing wrong?
package main
import (
"fmt"
"math"
"sync"
"time"
)
// Result pairs the running integrand sum with a lock so concurrent
// goroutines can update it safely.
type Result struct {
	result float64
	lock   sync.RWMutex // only Lock/Unlock are used here — assumes write-mostly access
}

// Shared state: wg tracks the outstanding sample goroutines, result
// accumulates the midpoint-rule samples.
var wg sync.WaitGroup
var result Result
// main approximates the integral of sqrt(x) over [0, 1] with the
// midpoint rule (n sub-intervals), spawning one goroutine per sample
// point, and prints the integral estimate and the elapsed time.
//
// NOTE(review): one goroutine per sample is the performance problem
// being asked about — per-goroutine scheduling plus a mutex acquisition
// costs far more than the single math.Sqrt each goroutine performs, so
// this runs slower than a plain serial loop.
func main() {
	now := time.Now()
	a := 0.0
	b := 1.0
	n := 100000.0
	deltax := (b - a) / n
	wg.Add(int(n))
	for i := 0.0; i < n; i++ {
		go f(a, deltax, i)
	}
	wg.Wait()
	// deltax * sum of heights = midpoint-rule estimate of the integral.
	fmt.Println(deltax * result.result)
	fmt.Println(time.Now().Sub(now))
}
// f computes one midpoint-rule sample sqrt(a + deltax*(i+0.5)), adds
// it to the shared accumulator under the lock, and signals completion
// on the package-level WaitGroup.
func f(a float64, deltax float64, i float64) {
	fx := math.Sqrt(a + deltax*(i+0.5))
	// The critical section is far cheaper than the goroutine wrapping it,
	// which is why this concurrent version loses to the serial loop.
	result.lock.Lock()
	result.result += fx
	result.lock.Unlock()
	wg.Done()
}
Unless the work done inside each goroutine costs substantially more than the combined overhead of scheduling the goroutine, switching contexts, and taking a mutex to update the shared value, it is faster to do the work serially.
Take a look at a slightly modified version. All I've done is add a delay of 1 microsecond in the f()
function.
package main
import (
"fmt"
"math"
"sync"
"time"
)
// Result pairs the running integrand sum with a lock; the concurrent
// variant updates it under the lock, the serial variant bypasses it.
type Result struct {
	result float64
	lock   sync.RWMutex
}

// Shared state: wg tracks outstanding goroutines in the concurrent
// run, result accumulates samples for both runs.
var wg sync.WaitGroup
var result Result
// main runs the concurrent and serial benchmarks back-to-back,
// zeroing the shared accumulator before each pass so the two
// integral results are comparable.
func main() {
	passes := []struct {
		label string
		run   func()
	}{
		{"concurrent", concurrent},
		{"serial", serial},
	}
	for _, p := range passes {
		result.result = 0 // no-op before the first pass; resets for the second
		fmt.Println(p.label)
		p.run()
	}
}
// concurrent integrates sqrt(x) over [0, 1] with the midpoint rule,
// one goroutine per sample point, then prints the integral estimate
// and the elapsed wall-clock time.
func concurrent() {
	now := time.Now()
	a := 0.0
	b := 1.0
	n := 100000.0
	deltax := (b - a) / n
	wg.Add(int(n))
	for i := 0.0; i < n; i++ {
		go f(a, deltax, i, true)
	}
	wg.Wait()
	fmt.Println(deltax * result.result)
	// time.Since(now) is the idiomatic spelling of time.Now().Sub(now).
	fmt.Println(time.Since(now))
}
// serial integrates sqrt(x) over [0, 1] with the midpoint rule in a
// plain loop (no goroutines, no locking), then prints the integral
// estimate and the elapsed wall-clock time.
func serial() {
	now := time.Now()
	a := 0.0
	b := 1.0
	n := 100000.0
	deltax := (b - a) / n
	for i := 0.0; i < n; i++ {
		f(a, deltax, i, false)
	}
	fmt.Println(deltax * result.result)
	// time.Since(now) is the idiomatic spelling of time.Now().Sub(now).
	fmt.Println(time.Since(now))
}
// f simulates 1µs of work, evaluates one midpoint-rule sample
// sqrt(a + deltax*(i+0.5)), and adds it to the shared accumulator —
// under the lock plus a WaitGroup signal in concurrent mode,
// directly in serial mode.
func f(a, deltax, i float64, concurrent bool) {
	time.Sleep(1 * time.Microsecond) // artificial per-sample workload
	height := math.Sqrt(a + deltax*(i+0.5))
	if !concurrent {
		result.result += height
		return
	}
	result.lock.Lock()
	result.result += height
	result.lock.Unlock()
	wg.Done()
}
With the delay, the result was as follows (the concurrent version is much faster):
concurrent
0.6666666685900424
624.914165ms
serial
0.6666666685900422
5.609195767s
Without the delay:
concurrent
0.6666666685900428
50.771275ms
serial
0.6666666685900422
749.166µs
As you can see, the longer it takes to complete a task, the more sense it makes to do it concurrently, if possible.
3- For a performance gain, you may divide the work across CPU cores so that each goroutine accumulates its own partial sum, removing the need for the sync.RWMutex lock entirely:
Optimized with channels and runtime.NumCPU(), this takes 2ms on 2 cores and 993µs on 8 cores, while your sample code takes 61ms on 2 cores and 40ms on 8 cores:
See this working sample code and outputs:
package main
import (
"fmt"
"math"
"runtime"
"time"
)
// main integrates sqrt(x) over [0, 1] with the midpoint rule,
// splitting the n sample points into roughly one chunk per CPU core
// and summing the per-chunk partial results received over a channel.
func main() {
	nCPU := runtime.NumCPU()
	fmt.Println("nCPU =", nCPU)
	ch := make(chan float64, nCPU)
	startTime := time.Now()
	a := 0.0
	b := 1.0
	n := 100000.0
	deltax := (b - a) / n
	stepPerCPU := n / float64(nCPU)
	// Count the goroutines actually launched: floating-point
	// accumulation of stepPerCPU need not land exactly on n, so the
	// chunk count can differ from nCPU; receiving a fixed nCPU values
	// could then drop a chunk or block forever.
	launched := 0
	for start := 0.0; start < n; {
		stop := start + stepPerCPU
		go f(start, stop, a, deltax, ch)
		start = stop
		launched++
	}
	integral := 0.0
	for i := 0; i < launched; i++ {
		integral += <-ch
	}
	// time.Since is the idiomatic form of time.Now().Sub(startTime).
	fmt.Println(time.Since(startTime))
	fmt.Println(deltax * integral)
}
// f sums the midpoint-rule heights sqrt(a + deltax*(x+0.5)) for
// x in [start, stop), stepping by 1, and sends the partial sum on ch.
func f(start, stop, a, deltax float64, ch chan float64) {
	sum := 0.0
	for x := start; x < stop; x++ {
		sum += math.Sqrt(a + deltax*(x+0.5))
	}
	ch <- sum
}
Output on 2 Cores:
nCPU = 2
2.0001ms
0.6666666685900485
Output on 8 Cores:
nCPU = 8
993µs
0.6666666685900456
Your sample code, Output on 2 Cores:
0.6666666685900424
61.0035ms
Your sample code, Output on 8 Cores:
0.6666666685900415
40.9964ms
2- For good benchmark statistics, use a large number of samples (a big n):
With n := 10000000.0 on the same 2-core CPU, the version using 2 goroutines takes 110ms, while the single-goroutine version takes 215ms:
With n := 10000000.0
and single goroutine, see this working sample code:
package main
import (
"fmt"
"math"
"time"
)
// main integrates sqrt(x) over [0, 1] with the midpoint rule in a
// single goroutine, printing the elapsed time and the estimate.
func main() {
	now := time.Now()
	a := 0.0
	b := 1.0
	n := 10000000.0
	deltax := (b - a) / n
	result := 0.0
	for i := 0.0; i < n; i++ {
		result += math.Sqrt(a + deltax*(i+0.5))
	}
	// time.Since is the idiomatic form of time.Now().Sub(now).
	fmt.Println(time.Since(now))
	fmt.Println(deltax * result)
}
Output:
215.0123ms
0.6666666666685884
With n := 10000000.0
and 2 goroutines, see this working sample code:
package main
import (
"fmt"
"math"
"runtime"
"time"
)
// main integrates sqrt(x) over [0, 1] with the midpoint rule using
// n = 1e7 sample points, one chunk of samples per CPU core, and sums
// the per-chunk partial results received over a channel.
func main() {
	nCPU := runtime.NumCPU()
	fmt.Println("nCPU =", nCPU)
	ch := make(chan float64, nCPU)
	startTime := time.Now()
	a := 0.0
	b := 1.0
	n := 10000000.0
	deltax := (b - a) / n
	stepPerCPU := n / float64(nCPU)
	// Count the goroutines actually launched: floating-point
	// accumulation of stepPerCPU need not land exactly on n, so the
	// chunk count can differ from nCPU; receiving a fixed nCPU values
	// could then drop a chunk or block forever.
	launched := 0
	for start := 0.0; start < n; {
		stop := start + stepPerCPU
		go f(start, stop, a, deltax, ch)
		start = stop
		launched++
	}
	integral := 0.0
	for i := 0; i < launched; i++ {
		integral += <-ch
	}
	// time.Since is the idiomatic form of time.Now().Sub(startTime).
	fmt.Println(time.Since(startTime))
	fmt.Println(deltax * integral)
}
// f accumulates the integrand samples sqrt(a + deltax*(idx+0.5)) for
// indices idx in [start, stop) and delivers the chunk's total on ch.
func f(start, stop, a, deltax float64, ch chan float64) {
	partial := 0.0
	for idx := start; idx < stop; idx++ {
		midpoint := a + deltax*(idx+0.5)
		partial += math.Sqrt(midpoint)
	}
	ch <- partial
}
Output:
nCPU = 2
110.0063ms
0.6666666666686073
1- There is an optimum point for number of Goroutines, And from this point forward increasing number of Goroutines doesn't reduce program execution time:
On 2 Cores CPU, with the following code, The result is:
nCPU: 1, 2, 4, 8, 16
Time: 2.1601236s, 1.1220642s, 1.1060633s, 1.1140637s, 1.1380651s
As you can see, the drop in execution time from nCPU=1 to nCPU=2 is large, but beyond that point the gains are marginal. So nCPU=2 is the optimum for this sample code on a 2-core CPU, and using nCPU := runtime.NumCPU() is sufficient here.
package main
import (
"fmt"
"math"
"time"
)
// main integrates sqrt(x) over [0, 1] with the midpoint rule using
// n = 1e8 sample points split into nCPU chunks, summing the per-chunk
// partial results received over a channel. nCPU is hard-coded here to
// compare different goroutine counts on the same machine.
func main() {
	nCPU := 2 //2.1601236s@1 1.1220642s@2 1.1060633s@4 1.1140637s@8 1.1380651s@16
	fmt.Println("nCPU =", nCPU)
	ch := make(chan float64, nCPU)
	startTime := time.Now()
	a := 0.0
	b := 1.0
	n := 100000000.0
	deltax := (b - a) / n
	stepPerCPU := n / float64(nCPU)
	// Count the goroutines actually launched: floating-point
	// accumulation of stepPerCPU need not land exactly on n, so the
	// chunk count can differ from nCPU; receiving a fixed nCPU values
	// could then drop a chunk or block forever.
	launched := 0
	for start := 0.0; start < n; {
		stop := start + stepPerCPU
		go f(start, stop, a, deltax, ch)
		start = stop
		launched++
	}
	integral := 0.0
	for i := 0; i < launched; i++ {
		integral += <-ch
	}
	// time.Since is the idiomatic form of time.Now().Sub(startTime).
	fmt.Println(time.Since(startTime))
	fmt.Println(deltax * integral)
}
// f evaluates sqrt at the midpoint of every unit sub-interval in
// [start, stop) and reports the chunk's partial sum over ch.
func f(start, stop, a, deltax float64, ch chan float64) {
	var acc float64
	for k := start; k < stop; k++ {
		acc += math.Sqrt(a + deltax*(k+0.5))
	}
	ch <- acc
}