// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively and notes URLs and URL fragments
// that it's seen and prints a report of missing links at the end.
package main

import (
	"errors"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
)

var (
	root    = flag.String("root", "http://localhost:6060", "Root to crawl")
	verbose = flag.Bool("verbose", false, "verbose")
)

var wg sync.WaitGroup        // outstanding fetches
var urlq = make(chan string) // URLs to crawl

// urlFrag is a URL and its optional #fragment (without the #).
type urlFrag struct {
	url, frag string
}

// Guarded by mu:
var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)

// aRx matches the site-local (leading "/") target of an <a href=...> tag.
var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

// Owned by crawlLoop goroutine:
var (
	linkSources = make(map[string][]string) // url no fragment -> sources
	fragExists  = make(map[urlFrag]bool)
	problems    []string
)

// localLinks returns the distinct site-local link targets found in body,
// in order of first appearance. Links under /src/ are skipped.
func localLinks(body string) []string {
	var links []string
	seen := map[string]bool{}
	for _, m := range aRx.FindAllStringSubmatch(body, -1) {
		ref := m[1]
		if strings.HasPrefix(ref, "/src/") {
			continue
		}
		if !seen[ref] {
			seen[ref] = true
			links = append(links, ref)
		}
	}
	return links
}

// idRx matches the value of an id=... attribute.
var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

// pageIDs returns every id attribute value found in body, in document
// order. Duplicates are not removed.
func pageIDs(body string) []string {
	var ids []string
	for _, m := range idRx.FindAllStringSubmatch(body, -1) {
		ids = append(ids, m[1])
	}
	return ids
}

// stripFragment returns u without its "#fragment" suffix, if any.
func stripFragment(u string) string {
	if i := strings.Index(u, "#"); i >= 0 {
		return u[:i]
	}
	return u
}

// crawl schedules url to be fetched unless it has been scheduled before.
// sourceURL is the page on which url was found.
//
// url may contain a #fragment, and the fragment is then noted as needing to exist.
func crawl(url string, sourceURL string) {
	if strings.Contains(url, "/devel/release") {
		return
	}
	mu.Lock()
	defer mu.Unlock()
	var frag string
	if i := strings.Index(url, "#"); i >= 0 {
		frag = url[i+1:]
		url = url[:i]
		if frag != "" {
			uf := urlFrag{url, frag}
			neededFrags[uf] = append(neededFrags[uf], sourceURL)
		}
	}
	if crawled[url] {
		return
	}
	crawled[url] = true

	// Count the fetch before it is queued so wg cannot reach zero while
	// work is still pending. The send happens in its own goroutine because
	// crawl is also called from the crawlLoop goroutine, which is the only
	// receiver on the unbuffered urlq and would otherwise deadlock.
	wg.Add(1)
	go func() {
		urlq <- url
	}()
}

// addProblem records a failure to fetch url, together with the pages
// that link to it. It must only be called from the crawlLoop goroutine.
func addProblem(url, errmsg string) {
	msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
	if *verbose {
		log.Print(msg)
	}
	problems = append(problems, msg)
}

// crawlLoop fetches every URL received on urlq until the channel is
// closed, recording any fetch error as a problem.
func crawlLoop() {
	for url := range urlq {
		if err := doCrawl(url); err != nil {
			addProblem(url, err.Error())
		}
		// Mark the fetch finished only after its problem (if any) has been
		// recorded, so that whoever returns from wg.Wait sees a complete
		// problems slice.
		wg.Done()
	}
}

// doCrawl fetches url (which must not contain a fragment), schedules the
// site-local links found on the page and records the ids the page defines.
// Redirects are not followed directly: an on-site redirect target is
// scheduled as a separate crawl and off-site targets are ignored.
// It returns an error if the page could not be fetched or read.
// It must only be called from the crawlLoop goroutine.
func doCrawl(url string) error {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	// Use the transport directly so that redirects are seen, not followed.
	res, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		return err
	}
	// Close the body on every path, including redirects and error statuses.
	defer res.Body.Close()

	// Handle redirects.
	if res.StatusCode/100 == 3 {
		newURL, err := res.Location()
		if err != nil {
			return fmt.Errorf("resolving redirect: %v", err)
		}
		if !strings.HasPrefix(newURL.String(), *root) {
			// Skip off-site redirects.
			return nil
		}
		crawl(newURL.String(), url)
		return nil
	}
	if res.StatusCode != 200 {
		return errors.New(res.Status)
	}
	slurp, err := ioutil.ReadAll(res.Body)
	if err != nil {
		// Report the failure like any other fetch error instead of
		// aborting the whole crawl and losing the collected problems.
		return fmt.Errorf("reading body: %v", err)
	}
	if *verbose {
		log.Printf("Len of %s: %d", url, len(slurp))
	}
	body := string(slurp)
	for _, ref := range localLinks(body) {
		if *verbose {
			log.Printf("  links to %s", ref)
		}
		dest := *root + ref
		// linkSources is keyed by the fragment-less URL, which is what
		// addProblem looks up; ref may carry a #fragment.
		page := stripFragment(dest)
		linkSources[page] = append(linkSources[page], url)
		crawl(dest, url)
	}
	for _, id := range pageIDs(body) {
		if *verbose {
			log.Printf(" url %s has #%s", url, id)
		}
		fragExists[urlFrag{url, id}] = true
	}
	return nil
}

func main() {
	flag.Parse()

	go crawlLoop()
	crawl(*root, "")

	// Once Wait returns, every scheduled fetch has been processed and its
	// result recorded, so the shared maps and problems can be read here.
	wg.Wait()
	close(urlq)
	for uf, needers := range neededFrags {
		if !fragExists[uf] {
			problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
		}
	}

	for _, s := range problems {
		fmt.Println(s)
	}
	if len(problems) > 0 {
		os.Exit(1)
	}
}