linkchecker: Organize state into a struct, and add report.

Moves all the state into a struct so it doesn't need to be passed around as much. Also adds a report showing how long the check took and what it found. This includes a minor behavior change: a failure to load a file is now a fatal error (the checker panics) instead of being silently ignored. This should only happen if there is a permission error or some other shenanigans going on.
2021-01-11 10:13:08 -08:00 · 2021-01-11 10:13:08 -08:00 · 9e11b61e8d
commit 9e11b61e8d
parent e4ca1662f2
1 changed files with 261 additions and 212 deletions
--- a/src/tools/linkchecker/main.rs
+++ b/src/tools/linkchecker/main.rs
@@ -20,6 +20,7 @@ use std::env;
 use std::fs;
 use std::path::{Component, Path, PathBuf};
 use std::rc::Rc;
+use std::time::Instant;

 use once_cell::sync::Lazy;
 use regex::Regex;
@@ -89,16 +90,41 @@ macro_rules! t {
 fn main() {
    let docs = env::args_os().nth(1).unwrap();
    let docs = env::current_dir().unwrap().join(docs);
-    let mut errors = false;
-    walk(&mut HashMap::new(), &docs, &docs, &mut errors);
-    if errors {
-        panic!("found some broken links");
+    let mut checker = Checker {
+        root: docs.clone(),
+        cache: HashMap::new(),
+        errors: 0,
+        start: Instant::now(),
+        html_files: 0,
+        html_redirects: 0,
+        links_checked: 0,
+        links_ignored_external: 0,
+        links_ignored_exception: 0,
+        intra_doc_exceptions: 0,
+    };
+    checker.walk(&docs);
+    checker.report();
+    if checker.errors != 0 {
+        println!("found some broken links");
+        std::process::exit(1);
    }
 }

+struct Checker {
+    root: PathBuf,
+    cache: Cache,
+    errors: u32,
+    start: Instant,
+    html_files: u32,
+    html_redirects: u32,
+    links_checked: u32,
+    links_ignored_external: u32,
+    links_ignored_exception: u32,
+    intra_doc_exceptions: u32,
+}
+
 #[derive(Debug)]
 pub enum LoadError {
-    IOError(std::io::Error),
    BrokenRedirect(PathBuf, std::io::Error),
    IsRedirect,
 }
@@ -131,13 +157,13 @@ fn small_url_encode(s: &str) -> String {
 }

 impl FileEntry {
-    fn parse_ids(&mut self, file: &Path, contents: &str, errors: &mut bool) {
+    fn parse_ids(&mut self, file: &Path, contents: &str, errors: &mut u32) {
        if self.ids.is_empty() {
            with_attrs_in_source(contents, " id", |fragment, i, _| {
                let frag = fragment.trim_start_matches("#").to_owned();
                let encoded = small_url_encode(&frag);
                if !self.ids.insert(frag) {
-                    *errors = true;
+                    *errors += 1;
                    println!("{}:{}: id is not unique: `{}`", file.display(), i, fragment);
                }
                // Just in case, we also add the encoded id.
@@ -147,22 +173,237 @@ impl FileEntry {
    }
 }

-fn walk(cache: &mut Cache, root: &Path, dir: &Path, errors: &mut bool) {
-    for entry in t!(dir.read_dir()).map(|e| t!(e)) {
-        let path = entry.path();
-        let kind = t!(entry.file_type());
-        if kind.is_dir() {
-            walk(cache, root, &path, errors);
-        } else {
-            let pretty_path = check(cache, root, &path, errors);
-            if let Some(pretty_path) = pretty_path {
-                let entry = cache.get_mut(&pretty_path).unwrap();
-                // we don't need the source anymore,
-                // so drop to reduce memory-usage
-                entry.source = Rc::new(String::new());
+impl Checker {
+    fn walk(&mut self, dir: &Path) {
+        for entry in t!(dir.read_dir()).map(|e| t!(e)) {
+            let path = entry.path();
+            let kind = t!(entry.file_type());
+            if kind.is_dir() {
+                self.walk(&path);
+            } else {
+                let pretty_path = self.check(&path);
+                if let Some(pretty_path) = pretty_path {
+                    let entry = self.cache.get_mut(&pretty_path).unwrap();
+                    // we don't need the source anymore,
+                    // so drop to reduce memory-usage
+                    entry.source = Rc::new(String::new());
+                }
            }
        }
    }
+
+    fn check(&mut self, file: &Path) -> Option<PathBuf> {
+        // Ignore non-HTML files.
+        if file.extension().and_then(|s| s.to_str()) != Some("html") {
+            return None;
+        }
+        self.html_files += 1;
+
+        let res = self.load_file(file, SkipRedirect);
+        let (pretty_file, contents) = match res {
+            Ok(res) => res,
+            Err(_) => return None,
+        };
+        self.cache.get_mut(&pretty_file).unwrap().parse_ids(
+            &pretty_file,
+            &contents,
+            &mut self.errors,
+        );
+
+        // Search for anything that's the regex 'href[ ]*=[ ]*".*?"'
+        with_attrs_in_source(&contents, " href", |url, i, base| {
+            // Ignore external URLs
+            if url.starts_with("http:")
+                || url.starts_with("https:")
+                || url.starts_with("javascript:")
+                || url.starts_with("ftp:")
+                || url.starts_with("irc:")
+                || url.starts_with("data:")
+            {
+                self.links_ignored_external += 1;
+                return;
+            }
+            self.links_checked += 1;
+            let (url, fragment) = match url.split_once('#') {
+                None => (url, None),
+                Some((url, fragment)) => (url, Some(fragment)),
+            };
+            // NB: the `splitn` always succeeds, even if the delimiter is not present.
+            let url = url.splitn(2, '?').next().unwrap();
+
+            // Once we've plucked out the URL, parse it using our base url and
+            // then try to extract a file path.
+            let mut path = file.to_path_buf();
+            if !base.is_empty() || !url.is_empty() {
+                path.pop();
+                for part in Path::new(base).join(url).components() {
+                    match part {
+                        Component::Prefix(_) | Component::RootDir => {
+                            // Avoid absolute paths as they make the docs not
+                            // relocatable by making assumptions on where the docs
+                            // are hosted relative to the site root.
+                            self.errors += 1;
+                            println!(
+                                "{}:{}: absolute path - {}",
+                                pretty_file.display(),
+                                i + 1,
+                                Path::new(base).join(url).display()
+                            );
+                            return;
+                        }
+                        Component::CurDir => {}
+                        Component::ParentDir => {
+                            path.pop();
+                        }
+                        Component::Normal(s) => {
+                            path.push(s);
+                        }
+                    }
+                }
+            }
+
+            // Alright, if we've found a file name then this file had better
+            // exist! If it doesn't then we register and print an error.
+            if path.exists() {
+                if path.is_dir() {
+                    // Links to directories show as directory listings when viewing
+                    // the docs offline so it's best to avoid them.
+                    self.errors += 1;
+                    let pretty_path = path.strip_prefix(&self.root).unwrap_or(&path);
+                    println!(
+                        "{}:{}: directory link - {}",
+                        pretty_file.display(),
+                        i + 1,
+                        pretty_path.display()
+                    );
+                    return;
+                }
+                if let Some(extension) = path.extension() {
+                    // Ignore non-HTML files.
+                    if extension != "html" {
+                        return;
+                    }
+                }
+                let res = self.load_file(&path, FromRedirect(false));
+                let (pretty_path, contents) = match res {
+                    Ok(res) => res,
+                    Err(LoadError::BrokenRedirect(target, _)) => {
+                        self.errors += 1;
+                        println!(
+                            "{}:{}: broken redirect to {}",
+                            pretty_file.display(),
+                            i + 1,
+                            target.display()
+                        );
+                        return;
+                    }
+                    Err(LoadError::IsRedirect) => unreachable!(),
+                };
+
+                if let Some(ref fragment) = fragment {
+                    // Fragments like `#1-6` are most likely line numbers to be
+                    // interpreted by javascript, so we're ignoring these
+                    if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) {
+                        return;
+                    }
+
+                    // These appear to be broken in mdbook right now?
+                    if fragment.starts_with('-') {
+                        return;
+                    }
+
+                    let entry = self.cache.get_mut(&pretty_path).unwrap();
+                    entry.parse_ids(&pretty_path, &contents, &mut self.errors);
+
+                    if entry.ids.contains(*fragment) {
+                        return;
+                    }
+
+                    if is_exception(file, &format!("#{}", fragment)) {
+                        self.links_ignored_exception += 1;
+                    } else {
+                        self.errors += 1;
+                        print!("{}:{}: broken link fragment ", pretty_file.display(), i + 1);
+                        println!("`#{}` pointing to `{}`", fragment, pretty_path.display());
+                    };
+                }
+            } else {
+                let pretty_path = path.strip_prefix(&self.root).unwrap_or(&path);
+                if is_exception(file, pretty_path.to_str().unwrap()) {
+                } else {
+                    self.errors += 1;
+                    print!("{}:{}: broken link - ", pretty_file.display(), i + 1);
+                    println!("{}", pretty_path.display());
+                }
+            }
+        });
+
+        // Search for intra-doc links that rustdoc didn't warn about
+        // FIXME(#77199, 77200) Rustdoc should just warn about these directly.
+        // NOTE: only looks at one line at a time; in practice this should find most links
+        for (i, line) in contents.lines().enumerate() {
+            for broken_link in BROKEN_INTRA_DOC_LINK.captures_iter(line) {
+                if is_intra_doc_exception(file, &broken_link[1]) {
+                    self.intra_doc_exceptions += 1;
+                } else {
+                    self.errors += 1;
+                    print!("{}:{}: broken intra-doc link - ", pretty_file.display(), i + 1);
+                    println!("{}", &broken_link[0]);
+                }
+            }
+        }
+        Some(pretty_file)
+    }
+
+    fn load_file(
+        &mut self,
+        file: &Path,
+        redirect: Redirect,
+    ) -> Result<(PathBuf, Rc<String>), LoadError> {
+        let pretty_file = PathBuf::from(file.strip_prefix(&self.root).unwrap_or(&file));
+
+        let (maybe_redirect, contents) = match self.cache.entry(pretty_file.clone()) {
+            Entry::Occupied(entry) => (None, entry.get().source.clone()),
+            Entry::Vacant(entry) => {
+                let contents = match fs::read_to_string(file) {
+                    Ok(s) => Rc::new(s),
+                    Err(err) => {
+                        return Err(if let FromRedirect(true) = redirect {
+                            LoadError::BrokenRedirect(file.to_path_buf(), err)
+                        } else {
+                            panic!("error loading {}: {}", file.display(), err);
+                        });
+                    }
+                };
+
+                let maybe = maybe_redirect(&contents);
+                if maybe.is_some() {
+                    self.html_redirects += 1;
+                    if let SkipRedirect = redirect {
+                        return Err(LoadError::IsRedirect);
+                    }
+                } else {
+                    entry.insert(FileEntry { source: contents.clone(), ids: HashSet::new() });
+                }
+                (maybe, contents)
+            }
+        };
+        match maybe_redirect.map(|url| file.parent().unwrap().join(url)) {
+            Some(redirect_file) => self.load_file(&redirect_file, FromRedirect(true)),
+            None => Ok((pretty_file, contents)),
+        }
+    }
+
+    fn report(&self) {
+        println!("checked links in: {:.1}s", self.start.elapsed().as_secs_f64());
+        println!("number of HTML files scanned: {}", self.html_files);
+        println!("number of HTML redirects found: {}", self.html_redirects);
+        println!("number of links checked: {}", self.links_checked);
+        println!("number of links ignored due to external: {}", self.links_ignored_external);
+        println!("number of links ignored due to exceptions: {}", self.links_ignored_exception);
+        println!("number of intra doc links ignored: {}", self.intra_doc_exceptions);
+        println!("errors found: {}", self.errors);
+    }
 }

 fn is_intra_doc_exception(file: &Path, link: &str) -> bool {
@@ -191,198 +432,6 @@ fn is_exception(file: &Path, link: &str) -> bool {
    }
 }

-fn check(cache: &mut Cache, root: &Path, file: &Path, errors: &mut bool) -> Option<PathBuf> {
-    // Ignore non-HTML files.
-    if file.extension().and_then(|s| s.to_str()) != Some("html") {
-        return None;
-    }
-
-    let res = load_file(cache, root, file, SkipRedirect);
-    let (pretty_file, contents) = match res {
-        Ok(res) => res,
-        Err(_) => return None,
-    };
-    {
-        cache.get_mut(&pretty_file).unwrap().parse_ids(&pretty_file, &contents, errors);
-    }
-
-    // Search for anything that's the regex 'href[ ]*=[ ]*".*?"'
-    with_attrs_in_source(&contents, " href", |url, i, base| {
-        // Ignore external URLs
-        if url.starts_with("http:")
-            || url.starts_with("https:")
-            || url.starts_with("javascript:")
-            || url.starts_with("ftp:")
-            || url.starts_with("irc:")
-            || url.starts_with("data:")
-        {
-            return;
-        }
-        let (url, fragment) = match url.split_once('#') {
-            None => (url, None),
-            Some((url, fragment)) => (url, Some(fragment)),
-        };
-        // NB: the `splitn` always succeeds, even if the delimiter is not present.
-        let url = url.splitn(2, '?').next().unwrap();
-
-        // Once we've plucked out the URL, parse it using our base url and
-        // then try to extract a file path.
-        let mut path = file.to_path_buf();
-        if !base.is_empty() || !url.is_empty() {
-            path.pop();
-            for part in Path::new(base).join(url).components() {
-                match part {
-                    Component::Prefix(_) | Component::RootDir => {
-                        // Avoid absolute paths as they make the docs not
-                        // relocatable by making assumptions on where the docs
-                        // are hosted relative to the site root.
-                        *errors = true;
-                        println!(
-                            "{}:{}: absolute path - {}",
-                            pretty_file.display(),
-                            i + 1,
-                            Path::new(base).join(url).display()
-                        );
-                        return;
-                    }
-                    Component::CurDir => {}
-                    Component::ParentDir => {
-                        path.pop();
-                    }
-                    Component::Normal(s) => {
-                        path.push(s);
-                    }
-                }
-            }
-        }
-
-        // Alright, if we've found a file name then this file had better
-        // exist! If it doesn't then we register and print an error.
-        if path.exists() {
-            if path.is_dir() {
-                // Links to directories show as directory listings when viewing
-                // the docs offline so it's best to avoid them.
-                *errors = true;
-                let pretty_path = path.strip_prefix(root).unwrap_or(&path);
-                println!(
-                    "{}:{}: directory link - {}",
-                    pretty_file.display(),
-                    i + 1,
-                    pretty_path.display()
-                );
-                return;
-            }
-            if let Some(extension) = path.extension() {
-                // Ignore none HTML files.
-                if extension != "html" {
-                    return;
-                }
-            }
-            let res = load_file(cache, root, &path, FromRedirect(false));
-            let (pretty_path, contents) = match res {
-                Ok(res) => res,
-                Err(LoadError::IOError(err)) => {
-                    panic!("error loading {}: {}", path.display(), err);
-                }
-                Err(LoadError::BrokenRedirect(target, _)) => {
-                    *errors = true;
-                    println!(
-                        "{}:{}: broken redirect to {}",
-                        pretty_file.display(),
-                        i + 1,
-                        target.display()
-                    );
-                    return;
-                }
-                Err(LoadError::IsRedirect) => unreachable!(),
-            };
-
-            if let Some(ref fragment) = fragment {
-                // Fragments like `#1-6` are most likely line numbers to be
-                // interpreted by javascript, so we're ignoring these
-                if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) {
-                    return;
-                }
-
-                // These appear to be broken in mdbook right now?
-                if fragment.starts_with('-') {
-                    return;
-                }
-
-                let entry = &mut cache.get_mut(&pretty_path).unwrap();
-                entry.parse_ids(&pretty_path, &contents, errors);
-
-                if !entry.ids.contains(*fragment) && !is_exception(file, &format!("#{}", fragment))
-                {
-                    *errors = true;
-                    print!("{}:{}: broken link fragment ", pretty_file.display(), i + 1);
-                    println!("`#{}` pointing to `{}`", fragment, pretty_path.display());
-                };
-            }
-        } else {
-            let pretty_path = path.strip_prefix(root).unwrap_or(&path);
-            if !is_exception(file, pretty_path.to_str().unwrap()) {
-                *errors = true;
-                print!("{}:{}: broken link - ", pretty_file.display(), i + 1);
-                println!("{}", pretty_path.display());
-            }
-        }
-    });
-
-    // Search for intra-doc links that rustdoc didn't warn about
-    // FIXME(#77199, 77200) Rustdoc should just warn about these directly.
-    // NOTE: only looks at one line at a time; in practice this should find most links
-    for (i, line) in contents.lines().enumerate() {
-        for broken_link in BROKEN_INTRA_DOC_LINK.captures_iter(line) {
-            if !is_intra_doc_exception(file, &broken_link[1]) {
-                *errors = true;
-                print!("{}:{}: broken intra-doc link - ", pretty_file.display(), i + 1);
-                println!("{}", &broken_link[0]);
-            }
-        }
-    }
-    Some(pretty_file)
-}
-
-fn load_file(
-    cache: &mut Cache,
-    root: &Path,
-    file: &Path,
-    redirect: Redirect,
-) -> Result<(PathBuf, Rc<String>), LoadError> {
-    let pretty_file = PathBuf::from(file.strip_prefix(root).unwrap_or(&file));
-
-    let (maybe_redirect, contents) = match cache.entry(pretty_file.clone()) {
-        Entry::Occupied(entry) => (None, entry.get().source.clone()),
-        Entry::Vacant(entry) => {
-            let contents = match fs::read_to_string(file) {
-                Ok(s) => Rc::new(s),
-                Err(err) => {
-                    return Err(if let FromRedirect(true) = redirect {
-                        LoadError::BrokenRedirect(file.to_path_buf(), err)
-                    } else {
-                        LoadError::IOError(err)
-                    });
-                }
-            };
-
-            let maybe = maybe_redirect(&contents);
-            if maybe.is_some() {
-                if let SkipRedirect = redirect {
-                    return Err(LoadError::IsRedirect);
-                }
-            } else {
-                entry.insert(FileEntry { source: contents.clone(), ids: HashSet::new() });
-            }
-            (maybe, contents)
-        }
-    };
-    match maybe_redirect.map(|url| file.parent().unwrap().join(url)) {
-        Some(redirect_file) => load_file(cache, root, &redirect_file, FromRedirect(true)),
-        None => Ok((pretty_file, contents)),
-    }
-}
-
 fn maybe_redirect(source: &str) -> Option<String> {
    const REDIRECT: &str = "<p>Redirecting to <a href=";