Goroutine not running when called via recursion

I'm doing the Web Crawler exercise from the Tour of Go. Here's my solution so far:

func GatherUrls(url string, fetcher Fetcher) []string {
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println("error:", err)
    } else {
        fmt.Printf("found: %s %q
", url, body)
    }
    return urls
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    // get all urls for depth
    // check if url has been crawled
    //  Y: noop
    //  N: crawl url
    // when depth is 0, stop
    fmt.Printf("crawling %q...
", url)
    if depth <= 0 {
        return
    }
    urls := GatherUrls(url, fetcher)
    fmt.Println("urls:", urls)
    for _, u := range urls {
        fmt.Println("currentUrl:", u)
        if _, exists := cache[u]; !exists {
            fmt.Printf("about to crawl %q
", u)
            go Crawl(u, depth - 1, fetcher)
        } else {
            cache[u] = true
        }
    }
}

func main() {
    cache = make(map[string]bool)
    Crawl("https://golang.org/", 4, fetcher)
}

When I run this code, Crawl() never runs when the function recurses (I know this because fmt.Printf("crawling %q...\n", url) only ever prints once).

Here are the logs:

crawling "https://golang.org/"...
found: https://golang.org/ "The Go Programming Language"
urls: [https://golang.org/pkg/ https://golang.org/cmd/]
currentUrl: https://golang.org/pkg/
about to crawl "https://golang.org/pkg/"
currentUrl: https://golang.org/cmd/
about to crawl "https://golang.org/cmd/"

What am I doing wrong? I suspect that spawning a goroutine to do the recursion is the wrong way to do this? Please advise.

Please note that I want to do this with as few libraries as possible. I've seen some answers that use sync.WaitGroup. I don't want to use that.

NOTE: The full code including the lesson boilerplate is below:

package main

import (
    "fmt"
)

var cache map[string]bool

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

func GatherUrls(url string, fetcher Fetcher) []string {
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println("error:", err)
    } else {
        fmt.Printf("found: %s %q
", url, body)
    }
    return urls
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    // get all urls for depth
    // check if url has been crawled
    //  Y: noop
    //  N: crawl url
    // when depth is 0, stop
    fmt.Printf("crawling %q...
", url)
    if depth <= 0 {
        return
    }
    urls := GatherUrls(url, fetcher)
    fmt.Println("urls:", urls)
    for _, u := range urls {
        fmt.Println("currentUrl:", u)
        if _, exists := cache[u]; !exists {
            fmt.Printf("about to crawl %q
", u)
            go Crawl(u, depth - 1, fetcher)
        } else {
            cache[u] = true
        }
    }
}

func main() {
    cache = make(map[string]bool)
    Crawl("https://golang.org/", 4, fetcher)
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "https://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "https://golang.org/pkg/",
            "https://golang.org/cmd/",
        },
    },
    "https://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "https://golang.org/",
            "https://golang.org/cmd/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
    "https://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
}

As you can see in this exercise (https://tour.golang.org/concurrency/10), we should accomplish the following tasks:

  • Fetch URLs in parallel.
  • Don't fetch the same URL twice.
  • Cache URLs already fetched on a map, but maps alone are not safe for concurrent use!

So, we can take the following steps to resolve the above tasks:

Create a struct to store the fetch result:

type Result struct {
    body string
    urls []string
    err  error
}

Create a struct to record which URLs have already been fetched in a map. We need to use sync.Mutex, which is not introduced in 'A Tour of Go':

type Cache struct {
    store map[string]bool
    mux   sync.Mutex
}
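
The Crawl function below refers to a package-level cache of this type; a minimal initialization (the variable name matches its use in Crawl) would look like:

var cache = Cache{store: make(map[string]bool)}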

Fetch the URL and body in parallel, adding each URL to the cache as it is fetched; but first we need to protect concurrent reads and writes of the map with the mutex. So, we can modify the Crawl function like this:

func Crawl(url string, depth int, fetcher Fetcher) {
    if depth <= 0 {
        return
    }

    ch := make(chan Result)

    go func(url string, res chan Result) {
        body, urls, err := fetcher.Fetch(url)

        if err != nil {
            ch <- Result{body, urls, err}
            return
        }

        var furls []string
        cache.mux.Lock()
        for _, u := range urls {
            if _, exists := cache.store[u]; !exists {
                furls = append(furls, u)
            }
            cache.store[u] = true
        }
        cache.mux.Unlock()

        ch <- Result{body: body, urls: furls, err: err}

    }(url, ch)

    res := <-ch

    if res.err != nil {
        fmt.Println(res.err)
        return
    }

    fmt.Printf("found: %s %q
", url, res.body)

    for _, u := range res.urls {
        Crawl(u, depth-1, fetcher)
    }
}
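
Because Crawl blocks on res := <-ch right after spawning the goroutine, each call returns only after the whole subtree below it has been crawled, so main needs no extra synchronization (a minimal sketch, using the fetcher from the lesson boilerplate). This also satisfies the no-WaitGroup constraint from the question:

func main() {
    Crawl("https://golang.org/", 4, fetcher)
}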

You can view the full code and run this in the playground: https://play.golang.org/p/iY9uBXchx3w

Hope this helps.

The main() function exits before the goroutines execute. Fix that by using a wait group.

There's also a data race on the cache: protect the map with a mutex, and always set the map entry to true for URLs that are about to be visited:

var wg sync.WaitGroup
var mu sync.Mutex
var fetched = map[string]bool{}

func Crawl(url string, depth int, fetcher Fetcher) {
    if depth <= 0 {
        return
    }
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q
", url, body)
    for _, u := range urls {
        mu.Lock()
        f := fetched[u]
        fetched[u] = true
        mu.Unlock()
        if !f {
            wg.Add(1)
            go func(u string) {
                defer wg.Done()
                Crawl(u, depth-1, fetcher)
            }(u)
        }
    }
    return
}
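
For completeness, main calls Crawl and then blocks on the wait group so every spawned goroutine can finish (a minimal sketch; the full program is behind the playground link below):

func main() {
    Crawl("https://golang.org/", 4, fetcher)
    wg.Wait() // blocks until every wg.Add(1) has a matching Done
}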

playground example

Wait groups are the idiomatic way to wait for goroutines to complete. If you cannot use sync.WaitGroup for some reason, then reimplement the type using a counter, mutex and channel:

type WaitGroup struct {
    mu   sync.Mutex
    n    int
    done chan struct{}
}

func (wg *WaitGroup) Add(i int) {
    wg.mu.Lock()
    defer wg.mu.Unlock()
    if wg.done == nil {
        wg.done = make(chan struct{})
    }
    wg.n += i
    if wg.n < 0 {
        panic("negative count")
    }
    if wg.n == 0 {
        close(wg.done)
        wg.done = nil
    }
}

func (wg *WaitGroup) Done() {
    wg.Add(-1)
}

func (wg *WaitGroup) Wait() {
    wg.mu.Lock()
    done := wg.done
    wg.mu.Unlock()
    if done != nil {
        <-done
    }
}
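
The replacement is used exactly like the standard type (a small usage sketch; the print stands in for real work):

func run() {
    var wg WaitGroup
    for i := 0; i < 3; i++ {
        wg.Add(1)
        go func(n int) {
            defer wg.Done()
            fmt.Println("worker", n) // stand-in for real work
        }(i)
    }
    wg.Wait() // returns once the counter drops back to zero
}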

playground example

The main function exits before the goroutines run.

You need to add a sync.WaitGroup to make main wait until all the goroutines are done:

package main

import (
    "fmt"
    "sync"
)

var cache map[string]bool
var mu sync.Mutex // guards cache against concurrent map access

var wg sync.WaitGroup

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

func GatherUrls(url string, fetcher Fetcher, Urls chan []string) {
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println("error:", err)
    } else {
        fmt.Printf("found: %s %q
", url, body)
    }
    Urls <- urls
    wg.Done()
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    // get all urls for depth
    // check if url has been crawled
    //  Y: noop
    //  N: crawl url
    // when depth is 0, stop
    // Balance the wg.Add(1) made by the caller, even on the early
    // return below; otherwise wg.Wait() would never unblock.
    defer wg.Done()
    fmt.Printf("crawling %q... %d\n", url, depth)
    if depth <= 0 {
        return
    }
    uc := make(chan []string)
    wg.Add(1)
    go GatherUrls(url, fetcher, uc)
    urls := <-uc
    fmt.Println("urls:", urls)
    for _, u := range urls {
        fmt.Println("currentUrl:", u)
        // check and mark the URL in one locked step so two
        // goroutines cannot both decide to crawl it
        mu.Lock()
        exists := cache[u]
        cache[u] = true
        mu.Unlock()
        if !exists {
            fmt.Printf("about to crawl %q\n", u)
            wg.Add(1)
            go Crawl(u, depth-1, fetcher)
        }
    }
}

func main() {
    cache = make(map[string]bool)
    wg.Add(1)
    go Crawl("https://golang.org/", 4, fetcher)
    wg.Wait()
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "https://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "https://golang.org/pkg/",
            "https://golang.org/cmd/",
        },
    },
    "https://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "https://golang.org/",
            "https://golang.org/cmd/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
    "https://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
}