From 0f24c88c0e1db854fa758b1723083bc51f796cd4 Mon Sep 17 00:00:00 2001 From: Joshua Nelson Date: Sat, 9 Jan 2021 10:15:28 -0500 Subject: [PATCH] Check for url-encoded fragments --- CHANGELOG.md | 2 ++ Cargo.lock | 1 + Cargo.toml | 2 ++ src/check.rs | 13 +++++++++++++ tests/broken_links.rs | 3 ++- tests/broken_links/src/lib.rs | 1 + tests/simple_project/src/lib.rs | 4 ++++ 7 files changed, 25 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e1cfd54..6d520bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,8 +6,10 @@ * When a website gives 405 Method Not Supported for HEAD requests, fall back to GET. In particular, this no longer marks all links to play.rust-lang.org as broken. [PR#136] +* URL-encoded fragments, like `#%E2%80%A0`, are now decoded. [PR#141] [PR#136]: https://github.com/deadlinks/cargo-deadlinks/pull/136 +[PR#141]: https://github.com/deadlinks/cargo-deadlinks/pull/141 #### Changed diff --git a/Cargo.lock b/Cargo.lock index 3ddd60e..d18e02e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -102,6 +102,7 @@ dependencies = [ "mockito", "num_cpus", "once_cell", + "percent-encoding", "pico-args", "predicates", "rayon", diff --git a/Cargo.toml b/Cargo.toml index b2c6596..aeb0454 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,8 @@ ureq = { version = "1.5.4", features = ["tls"], default-features = false } serde = "1.0" serde_derive = "1.0" url = "2" +# Try to keep this in sync with `url`'s version +percent-encoding = "2" walkdir = "2.1" [dev-dependencies] diff --git a/src/check.rs b/src/check.rs index 80f9e6a..3531108 100644 --- a/src/check.rs +++ b/src/check.rs @@ -156,6 +156,19 @@ fn is_fragment_available( return Ok(()); } + // Try again with percent-decoding. + // NOTE: This isn't done unconditionally because it's possible the fragment it's linking to was also percent-encoded. 
+ match percent_encoding::percent_decode(fragment.as_bytes()).decode_utf8() { + Ok(cow) => { + if fragments.contains(&*cow) { + return Ok(()); + } + } + // If this was invalid UTF8 after percent-decoding, it can't be in the file (since we have a `String`, not opaque bytes). + // Assume it wasn't meant to be url-encoded. + Err(err) => warn!("{} url-decoded to invalid UTF8: {}", fragment, err), + } + // Rust documentation uses `#n-m` fragments and JavaScript to highlight // a range of lines in HTML of source code, an element with `id` // attribute of (literal) "#n-m" will not exist, but elements with diff --git a/tests/broken_links.rs b/tests/broken_links.rs index 9d5b195..4f4446a 100644 --- a/tests/broken_links.rs +++ b/tests/broken_links.rs @@ -22,7 +22,8 @@ fn reports_broken_links() { .and(contains("Broken intra-doc link to [links]!")) .and(contains( "Fragment #fragments at index.html does not exist!", - )), + )) + .and(contains("Fragment #%FF at index.html does not exist!")), ); } diff --git a/tests/broken_links/src/lib.rs b/tests/broken_links/src/lib.rs index 46b298c..f45171a 100644 --- a/tests/broken_links/src/lib.rs +++ b/tests/broken_links/src/lib.rs @@ -2,3 +2,4 @@ //! with [intra-doc](links) that will be emitted as HTML //! and intra-doc [`links`][x] that won't. //! It also has [links to](#fragments). +//! [Non-unicode link](#%FF) diff --git a/tests/simple_project/src/lib.rs b/tests/simple_project/src/lib.rs index 64db733..7e22a45 100644 --- a/tests/simple_project/src/lib.rs +++ b/tests/simple_project/src/lib.rs @@ -1,3 +1,7 @@ +//! [Non-ascii link](#†) +//! +//!
<div id="†">Some text</div>
+ /// Foo function /// /// Has something to do with [bar](./fn.bar.html).