// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively, noting every URL and URL fragment it
// sees, and prints a report of missing links at the end.
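//
// Usage:
//
//	linkcheck [-root http://localhost:6060] [-verbose]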
package main

import (
	"errors"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
)

var (
	root    = flag.String("root", "http://localhost:6060", "root URL to crawl")
	verbose = flag.Bool("verbose", false, "print verbose output")
)

var wg sync.WaitGroup        // outstanding fetches
var urlq = make(chan string) // URLs to crawl

// urlFrag is a URL and its optional #fragment (without the #).
type urlFrag struct {
	url, frag string
}

var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)

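// aRx matches the href of an anchor tag when the target is a rooted
// path, e.g. <a href="/pkg/fmt/" yields the capture "/pkg/fmt/".
// Relative and off-site hrefs are not matched.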
var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

// Owned by crawlLoop goroutine:
var (
	linkSources = make(map[string][]string) // URL without fragment -> sources
	fragExists  = make(map[urlFrag]bool)
	problems    []string
)

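// localLinks returns the deduplicated rooted paths (e.g. "/pkg/fmt/")
// linked to from body, skipping links under /src/.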
func localLinks(body string) (links []string) {
	seen := map[string]bool{}
	mv := aRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ref := m[1]
		if strings.HasPrefix(ref, "/src/") {
			continue
		}
		if !seen[ref] {
			seen[ref] = true
			links = append(links, ref)
		}
	}
	return
}

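// idRx matches id attributes, e.g. id="Examples" yields the capture
// "Examples".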
var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

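// pageIDs returns the id attribute values found in body; these are
// the fragments the page defines.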
func pageIDs(body string) (ids []string) {
	mv := idRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ids = append(ids, m[1])
	}
	return
}

// crawl queues url to be crawled. url may contain a #fragment, in
// which case the fragment is noted as needing to exist on that page.
func crawl(url string, sourceURL string) {
	if strings.Contains(url, "/devel/release") {
		return
	}
	mu.Lock()
	defer mu.Unlock()
	var frag string
	if i := strings.Index(url, "#"); i >= 0 {
		frag = url[i+1:]
		url = url[:i]
		if frag != "" {
			uf := urlFrag{url, frag}
			neededFrags[uf] = append(neededFrags[uf], sourceURL)
		}
	}
	if crawled[url] {
		return
	}
	crawled[url] = true

	wg.Add(1)
	// Send on a new goroutine so crawl never blocks: urlq is unbuffered
	// and crawl may be called from crawlLoop itself via doCrawl.
	go func() {
		urlq <- url
	}()
}

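// addProblem records a problem with url, logging it immediately in
// -verbose mode.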
func addProblem(url, errmsg string) {
	msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
	if *verbose {
		log.Print(msg)
	}
	problems = append(problems, msg)
}

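// crawlLoop consumes urlq, fetching each URL and recording any failure.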
func crawlLoop() {
	for url := range urlq {
		if err := doCrawl(url); err != nil {
			addProblem(url, err.Error())
		}
		// Mark this fetch done only after any problem has been
		// recorded, so wg.Wait in main cannot race with addProblem.
		wg.Done()
	}
}

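// doCrawl fetches url, follows same-site redirects, records which
// fragments the page defines, and queues any local links for crawling.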
func doCrawl(url string) error {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	res, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		return err
	}
	defer res.Body.Close()
	// Handle redirects, following them only within the crawl root.
	if res.StatusCode/100 == 3 {
		newURL, err := res.Location()
		if err != nil {
			return fmt.Errorf("resolving redirect: %v", err)
		}
		if !strings.HasPrefix(newURL.String(), *root) {
			// Skip off-site redirects.
			return nil
		}
		crawl(newURL.String(), url)
		return nil
	}
	if res.StatusCode != 200 {
		return errors.New(res.Status)
	}
	slurp, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return fmt.Errorf("reading body: %v", err)
	}
	if *verbose {
		log.Printf("Len of %s: %d", url, len(slurp))
	}
	body := string(slurp)
	for _, ref := range localLinks(body) {
		if *verbose {
			log.Printf("  links to %s", ref)
		}
		dest := *root + ref
		linkSources[dest] = append(linkSources[dest], url)
		crawl(dest, url)
	}
	for _, id := range pageIDs(body) {
		if *verbose {
			log.Printf(" url %s has #%s", url, id)
		}
		fragExists[urlFrag{url, id}] = true
	}
	return nil
}

func main() {
	flag.Parse()

	go crawlLoop()
	crawl(*root, "")

	// Once wg drains, every queued fetch has completed; closing urlq
	// then lets crawlLoop exit.
	wg.Wait()
	close(urlq)
	for uf, needers := range neededFrags {
		if !fragExists[uf] {
			problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
		}
	}

	for _, s := range problems {
		fmt.Println(s)
	}
	if len(problems) > 0 {
		os.Exit(1)
	}
}