Added web crawler exercise

3 years ago · 1d46b526ee
parent b0f2d371e3
commit 1d46b526ee
1 changed files with 98 additions and 0 deletions
--- a/exercise-web-crawler/exercise-web-crawler.go
+++ b/exercise-web-crawler/exercise-web-crawler.go
@ -0,0 +1,98 @@
+package main
+
+import (
+	"fmt"
+	"sync"
+)
+
+// Shared crawl state, used by Crawl and main.
+var (
+	// mu guards visited.
+	mu sync.Mutex
+	// visited holds every URL that has already been claimed for crawling.
+	visited = make(map[string]bool)
+	// wgCrawl counts Crawl calls that have not returned yet; main waits on it.
+	wgCrawl sync.WaitGroup
+)
+
+// Fetcher is the page source used by Crawl. Crawl only calls Fetch, so any
+// type with this method (such as fakeFetcher) can be supplied.
+type Fetcher interface {
+	// Fetch returns the body of URL and
+	// a slice of URLs found on that page.
+	Fetch(url string) (body string, urls []string, err error)
+}
+
+// Crawl uses fetcher to recursively crawl pages starting with url, to a
+// maximum of depth. Every link that has not been seen before is crawled in
+// its own goroutine with depth-1.
+//
+// Seen URLs are recorded in the package-level visited map, guarded by mu, so
+// a URL reached through several links is crawled only once. Fetch errors are
+// printed and end the crawl of that page only.
+//
+// Crawl calls wgCrawl.Done when it returns; the caller must therefore call
+// wgCrawl.Add(1) before every invocation, as main and the loop below do.
+func Crawl(url string, depth int, fetcher Fetcher) {
+	defer wgCrawl.Done()
+
+	if depth <= 0 {
+		return
+	}
+
+	// Record the page being fetched. Linked URLs are marked by their parent
+	// in the loop below, but the starting URL has no parent: without this,
+	// the first page linking back to it would cause it to be fetched and
+	// printed a second time.
+	mu.Lock()
+	visited[url] = true
+	mu.Unlock()
+
+	body, urls, err := fetcher.Fetch(url)
+	if err != nil {
+		fmt.Println(err)
+		return
+	}
+	fmt.Printf("found: %s %q\n", url, body)
+
+	for _, u := range urls {
+		// Test and set inside one critical section so two goroutines that
+		// discover the same link cannot both decide to crawl it.
+		mu.Lock()
+		seen := visited[u]
+		visited[u] = true
+		mu.Unlock()
+		if seen {
+			continue
+		}
+		// Add before starting the goroutine: this call's own Done is still
+		// pending, so the counter cannot reach zero before the child counts.
+		wgCrawl.Add(1)
+		go Crawl(u, depth-1, fetcher)
+	}
+}
+
+// main crawls the canned site held in fetcher, starting at the Go home page,
+// and blocks until every Crawl call has returned.
+func main() {
+	const (
+		startURL = "https://golang.org/"
+		maxDepth = 4
+	)
+
+	// Crawl signals completion through wgCrawl.Done, so the root call has to
+	// be registered before it is made.
+	wgCrawl.Add(1)
+	Crawl(startURL, maxDepth, fetcher)
+	wgCrawl.Wait()
+}
+
+// fakeResult is the canned response stored for a single URL.
+type fakeResult struct {
+	body string   // text returned as the page body
+	urls []string // links returned for the page
+}
+
+// fakeFetcher is a Fetcher backed by a map from URL to canned result.
+type fakeFetcher map[string]*fakeResult
+
+// Fetch returns the canned body and links stored under url. A URL with no
+// entry in the map yields empty results and a "not found" error.
+func (f fakeFetcher) Fetch(url string) (string, []string, error) {
+	page, known := f[url]
+	if !known {
+		return "", nil, fmt.Errorf("not found: %s", url)
+	}
+	return page.body, page.urls, nil
+}
+
+// fetcher is the canned site crawled by main: each key is a page URL and each
+// value holds that page's body and outgoing links. "https://golang.org/cmd/"
+// is linked to but has no entry of its own, so fetching it reports not found.
+var fetcher = fakeFetcher{
+	"https://golang.org/": {
+		body: "The Go Programming Language",
+		urls: []string{
+			"https://golang.org/pkg/",
+			"https://golang.org/cmd/",
+		},
+	},
+	"https://golang.org/pkg/": {
+		body: "Packages",
+		urls: []string{
+			"https://golang.org/",
+			"https://golang.org/cmd/",
+			"https://golang.org/pkg/fmt/",
+			"https://golang.org/pkg/os/",
+		},
+	},
+	"https://golang.org/pkg/fmt/": {
+		body: "Package fmt",
+		urls: []string{
+			"https://golang.org/",
+			"https://golang.org/pkg/",
+		},
+	},
+	"https://golang.org/pkg/os/": {
+		body: "Package os",
+		urls: []string{
+			"https://golang.org/",
+			"https://golang.org/pkg/",
+		},
+	},
+}