Skip to content

Commit

Permalink
Update html5ever, drop kuchiki
Browse files Browse the repository at this point in the history
  • Loading branch information
andy128k committed Sep 9, 2023
1 parent 69003a6 commit 1ec0ae6
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 102 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ edition = "2018"
[dependencies]
regex = "1"
lazy_static = "1"
html5ever = "0.25"
kuchiki = "0.8"
html5ever = "0.26"
markup5ever_rcdom = "0.2"
25 changes: 4 additions & 21 deletions src/errors.rs
Original file line number Diff line number Diff line change
@@ -1,37 +1,20 @@
//! Error types that can be emitted by the sanitization procedure.
//! Errors that can be emitted by the sanitization procedure.

use std::error::Error;
use std::fmt;

/// Sanitization error
#[derive(Debug)]
pub enum SanitizeError {
/// UTF-8 decoding error
StrUtf8Error(std::str::Utf8Error),

/// UTF-8 decoding error
Utf8Error(std::string::FromUtf8Error),

/// Serialization error
SerializeError(std::io::Error),
}
pub struct SanitizeError(pub(crate) Box<dyn Error>);

impl fmt::Display for SanitizeError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SanitizeError::StrUtf8Error(e) => write!(f, "UTF-8 decode error {}", e),
SanitizeError::Utf8Error(e) => write!(f, "UTF-8 decode error {}", e),
SanitizeError::SerializeError(e) => write!(f, "Serialization error {}", e),
}
write!(f, "{}", self.0)
}
}

impl Error for SanitizeError {
fn source(&self) -> Option<&(dyn Error + 'static)> {
match self {
SanitizeError::StrUtf8Error(e) => Some(e),
SanitizeError::Utf8Error(e) => Some(e),
SanitizeError::SerializeError(e) => Some(e),
}
self.0.source()
}
}
19 changes: 10 additions & 9 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,22 @@ mod tests;

use crate::errors::SanitizeError;
use crate::rules::Rules;
use std::error::Error;

/// Sanitize HTML bytes
pub fn sanitize_bytes(rules: &Rules, input: &[u8]) -> Result<Vec<u8>, SanitizeError> {
let input_str = std::str::from_utf8(input).map_err(SanitizeError::StrUtf8Error)?;
let dom = parse::parse_str(input_str);
let new_dom = sanitize::sanitize_dom(&dom, rules);
let result_bytes = parse::unparse_bytes(&new_dom)?;
Ok(result_bytes)
fn inner(rules: &Rules, input: &[u8]) -> Result<Vec<u8>, Box<dyn Error>> {
let dom = parse::parse_dom(input)?;
let new_document = sanitize::sanitize_dom(&dom, rules);
let result_bytes = parse::unparse_document(&new_document)?;
Ok(result_bytes)
}
inner(rules, input).map_err(SanitizeError)
}

/// Sanitize HTML string
pub fn sanitize_str(rules: &Rules, input: &str) -> Result<String, SanitizeError> {
let dom = parse::parse_str(input);
let new_dom = sanitize::sanitize_dom(&dom, rules);
let result_bytes = parse::unparse_bytes(&new_dom)?;
let result_string = String::from_utf8(result_bytes).map_err(SanitizeError::Utf8Error)?;
let result_bytes = sanitize_bytes(rules, input.as_bytes())?;
let result_string = String::from_utf8(result_bytes).map_err(|e| SanitizeError(Box::new(e)))?;
Ok(result_string)
}
25 changes: 17 additions & 8 deletions src/parse.rs
Original file line number Diff line number Diff line change
@@ -1,23 +1,31 @@
use super::errors::SanitizeError;
use html5ever::driver::ParseOpts;
use html5ever::parse_document;
use html5ever::{
interface::QualName,
local_name, namespace_prefix, namespace_url, ns, serialize,
serialize::{SerializeOpts, TraversalScope},
tendril::TendrilSink,
};
use kuchiki::{parse_html_with_options, NodeRef, ParseOpts};
use markup5ever_rcdom::{Node, RcDom, SerializableHandle};
use std::default::Default;
use std::error::Error;
use std::io::Cursor;
use std::rc::Rc;

pub(crate) fn parse_str(input: &str) -> NodeRef {
pub(crate) fn parse_dom(input: &[u8]) -> Result<RcDom, Box<dyn Error>> {
let mut opts = ParseOpts::default();
opts.tree_builder.drop_doctype = true;

let mut parser = parse_html_with_options(opts);
parser.process(input.into());
parser.finish()
let mut cursor = Cursor::new(input);

let dom = parse_document(RcDom::default(), opts)
.from_utf8()
.read_from(&mut cursor)?;

Ok(dom)
}

pub(crate) fn unparse_bytes(dom: &NodeRef) -> Result<Vec<u8>, SanitizeError> {
pub(crate) fn unparse_document(document: &Rc<Node>) -> Result<Vec<u8>, Box<dyn Error>> {
let mut buf: Vec<u8> = Vec::new();

let parent = QualName::new(
Expand All @@ -32,7 +40,8 @@ pub(crate) fn unparse_bytes(dom: &NodeRef) -> Result<Vec<u8>, SanitizeError> {
create_missing_parent: false,
};

serialize(&mut buf, dom, opts).map_err(SanitizeError::SerializeError)?;
let document: SerializableHandle = document.clone().into();
serialize(&mut buf, &document, opts)?;

Ok(buf)
}
114 changes: 52 additions & 62 deletions src/sanitize.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
use crate::rules::{Element, Rules};
use html5ever::{interface::QualName, namespace_url, ns, LocalName};
use kuchiki::{Attribute, ElementData, ExpandedName, NodeData, NodeRef};
use html5ever::{
interface::QualName, namespace_url, ns, tendril::StrTendril, Attribute, LocalName,
};
use markup5ever_rcdom::{Node, NodeData, RcDom};
use std::{cell::RefCell, rc::Rc};

fn simple_qual_name(name: &str) -> QualName {
QualName::new(None, ns!(), LocalName::from(name))
Expand All @@ -14,29 +17,21 @@ fn qual_name_to_string(name: &QualName) -> String {
}
}

fn expanded_name_to_string(name: &ExpandedName) -> String {
if name.ns == ns!(html) || name.ns.is_empty() {
name.local.to_lowercase()
} else {
format!("{}:{}", name.ns.to_lowercase(), name.local.to_lowercase())
}
}

fn simple_element(
name: QualName,
attrs: Vec<(ExpandedName, Attribute)>,
children: Vec<NodeRef>,
) -> NodeRef {
let element = NodeRef::new_element(name, attrs);
for child in children {
child.detach();
element.append(child);
}
/// Builds a new element node from a name, a list of attributes and a list of children.
///
/// The node is created with `NodeData::Element`; `template_contents` and
/// `mathml_annotation_xml_integration_point` are left at their `Default` values.
/// The given `children` handles are appended to the new element's child list in order.
fn simple_element(name: QualName, attrs: Vec<Attribute>, children: Vec<Rc<Node>>) -> Rc<Node> {
let element = Node::new(NodeData::Element {
name,
attrs: RefCell::new(attrs),
template_contents: Default::default(),
mathml_annotation_xml_integration_point: Default::default(),
});
// Only the new element's child list is modified here.
// NOTE(review): nothing in this function updates any back-reference on the children — confirm
// that later stages (e.g. serialization) do not depend on one.
element.children.borrow_mut().extend(children);
element
}

fn create_space_text() -> NodeRef {
NodeRef::new_text(" ")
/// Creates a text node whose contents are a single space (`" "`).
fn create_space_text() -> Rc<Node> {
Node::new(NodeData::Text {
contents: RefCell::new(" ".into()),
})
}

enum ElementAction<'t> {
Expand Down Expand Up @@ -64,47 +59,43 @@ fn element_action<'t>(element_name: &QualName, rules: &'t Rules) -> ElementActio
}
}

fn clean_nodes(nodes: impl IntoIterator<Item = NodeRef>, rules: &Rules) -> Vec<NodeRef> {
let mut result = Vec::new();
for node in nodes {
let subnodes = clean_node(&node, rules);
result.extend(subnodes);
}
result
/// Applies `clean_node` to every node in `nodes` and concatenates the results, in input order,
/// into a single vector.
///
/// `clean_node` returns a `Vec`, so one input node may contribute zero, one or several nodes
/// to the output.
fn clean_nodes(nodes: &[Rc<Node>], rules: &Rules) -> Vec<Rc<Node>> {
nodes
.iter()
.flat_map(|node| clean_node(node, rules))
.collect()
}

fn clean_node(node: &NodeRef, rules: &Rules) -> Vec<NodeRef> {
match node.data() {
NodeData::Document(..) => vec![],
NodeData::DocumentFragment => vec![], // TODO: ??
NodeData::Doctype(..) => vec![],
NodeData::ProcessingInstruction(..) => vec![],
fn clean_node(node: &Rc<Node>, rules: &Rules) -> Vec<Rc<Node>> {
match node.data {
NodeData::Document => vec![],
NodeData::Doctype { .. } => vec![],
NodeData::ProcessingInstruction { .. } => vec![],

NodeData::Text(..) => vec![node.clone()],
NodeData::Text { .. } => vec![node.clone()],

NodeData::Comment(..) => {
NodeData::Comment { .. } => {
if rules.allow_comments {
vec![node.clone()]
} else {
vec![]
}
}

NodeData::Element(ElementData {
NodeData::Element {
ref name,
ref attributes,
ref attrs,
..
}) => {
} => {
match element_action(name, rules) {
ElementAction::Keep(element_sanitizer) => {
let mut new_attrs: Vec<(ExpandedName, Attribute)> = Vec::new();
let mut new_attrs: Vec<Attribute> = Vec::new();

/* whitelisted attributes */
for (attr_name, attr_value) in attributes.borrow().map.iter() {
if element_sanitizer
.is_valid(&expanded_name_to_string(attr_name), &attr_value.value)
/* allowlisted attributes */
for attr in attrs.borrow().iter() {
if element_sanitizer.is_valid(&qual_name_to_string(&attr.name), &attr.value)
{
new_attrs.push((attr_name.clone(), attr_value.clone()));
new_attrs.push(attr.clone());
}
}

Expand All @@ -113,27 +104,28 @@ fn clean_node(node: &NodeRef, rules: &Rules) -> Vec<NodeRef> {
element_sanitizer.mandatory_attributes.iter().collect();
mandatory_attributes.sort();
for &(attr_name, attr_value) in mandatory_attributes.iter() {
new_attrs.push((
ExpandedName::new(ns!(), LocalName::from(attr_name.as_str())),
Attribute {
new_attrs.push(Attribute {
name: QualName {
prefix: None,
value: attr_value.into(),
ns: ns!(),
local: LocalName::from(attr_name.as_str()),
},
));
value: StrTendril::from(attr_value.as_str()),
});
}

let children = clean_nodes(node.children(), rules);
let children = clean_nodes(&node.children.borrow(), rules);
let element = simple_element(name.clone(), new_attrs, children);

vec![element]
}

ElementAction::Delete => vec![],

ElementAction::Elide => clean_nodes(node.children(), rules),
ElementAction::Elide => clean_nodes(&node.children.borrow(), rules),

ElementAction::Space => {
let mut nodes = clean_nodes(node.children(), rules);
let mut nodes = clean_nodes(&node.children.borrow(), rules);
if nodes.is_empty() {
nodes.push(create_space_text());
} else {
Expand All @@ -144,7 +136,7 @@ fn clean_node(node: &NodeRef, rules: &Rules) -> Vec<NodeRef> {
}

ElementAction::Rename(rename_to) => {
let children = clean_nodes(node.children(), rules);
let children = clean_nodes(&node.children.borrow(), rules);
vec![simple_element(
simple_qual_name(rename_to),
Vec::new(),
Expand All @@ -156,12 +148,10 @@ fn clean_node(node: &NodeRef, rules: &Rules) -> Vec<NodeRef> {
}
}

pub(crate) fn sanitize_dom(dom: &NodeRef, mode: &Rules) -> NodeRef {
let new_children = clean_nodes(dom.children(), mode);
let new_dom = NodeRef::new_document();
for child in new_children {
child.detach();
new_dom.append(child);
}
/// Produces a new document node containing the sanitized children of `dom.document`.
///
/// The top-level children of the parsed document are passed through `clean_nodes` with the
/// rules given in `mode`; the resulting nodes are attached to a freshly created
/// `NodeData::Document` node, which is returned.
pub(crate) fn sanitize_dom(dom: &RcDom, mode: &Rules) -> Rc<Node> {
let new_children = clean_nodes(&dom.document.children.borrow(), mode);

// Build a fresh document root; this function does not modify `dom` itself.
let new_dom = Node::new(NodeData::Document);
new_dom.children.borrow_mut().extend(new_children);
new_dom
}

0 comments on commit 1ec0ae6

Please sign in to comment.