diff --git a/benches/tree_iterator.rs b/benches/tree_iterator.rs index 551363874..d0235d7eb 100644 --- a/benches/tree_iterator.rs +++ b/benches/tree_iterator.rs @@ -4,7 +4,7 @@ use criterion::{criterion_group, criterion_main, Criterion}; use gosub_html5::node::NodeId; use gosub_html5::parser::document::{Document, DocumentBuilder, TreeIterator}; use gosub_html5::parser::Html5Parser; -use gosub_shared::bytes::CharIterator; +use gosub_shared::byte_stream::ByteStream; fn wikipedia_main_page(c: &mut Criterion) { // Criterion can report inconsistent results from run to run in some cases. We attempt to @@ -14,13 +14,13 @@ fn wikipedia_main_page(c: &mut Criterion) { group.significance_level(0.1).sample_size(500); let html_file = File::open("tests/data/tree_iterator/wikipedia_main.html").unwrap(); - let mut char_iter = CharIterator::new(); - let _ = char_iter.read_from_file(html_file, Some(gosub_shared::bytes::Encoding::UTF8)); - char_iter.set_confidence(gosub_shared::bytes::Confidence::Certain); + let mut stream = ByteStream::new(); + let _ = stream.read_from_file(html_file, Some(gosub_shared::byte_stream::Encoding::UTF8)); + stream.set_confidence(gosub_shared::byte_stream::Confidence::Certain); let main_document = DocumentBuilder::new_document(None); let document = Document::clone(&main_document); - let _ = Html5Parser::parse_document(&mut char_iter, document, None); + let _ = Html5Parser::parse_document(&mut stream, document, None); group.bench_function("wikipedia main page", |b| { b.iter(|| { @@ -41,13 +41,13 @@ fn stackoverflow_home(c: &mut Criterion) { // using the main page of (english) wikipedia as a rough estimate of traversing a decently sized website let html_file = File::open("tests/data/tree_iterator/stackoverflow.html").unwrap(); - let mut char_iter = CharIterator::new(); - let _ = char_iter.read_from_file(html_file, Some(gosub_shared::bytes::Encoding::UTF8)); - char_iter.set_confidence(gosub_shared::bytes::Confidence::Certain); + let mut bytestream = ByteStream::new(); + let _ = bytestream.read_from_file(html_file, Some(gosub_shared::byte_stream::Encoding::UTF8)); + bytestream.set_confidence(gosub_shared::byte_stream::Confidence::Certain); let main_document = DocumentBuilder::new_document(None); let document = Document::clone(&main_document); - let _ = Html5Parser::parse_document(&mut char_iter, document, None); + let _ = Html5Parser::parse_document(&mut bytestream, document, None); group.bench_function("stackoverflow home", |b| { b.iter(|| { diff --git a/crates/gosub_bindings/src/lib.rs b/crates/gosub_bindings/src/lib.rs index 567d23bbc..95d4ac628 100644 --- a/crates/gosub_bindings/src/lib.rs +++ b/crates/gosub_bindings/src/lib.rs @@ -7,7 +7,7 @@ pub mod wrapper; use gosub_html5::parser::document::{Document, DocumentBuilder}; use gosub_html5::parser::Html5Parser; use gosub_rendering::render_tree::{Node, NodeType, RenderTree, TreeIterator}; -use gosub_shared::bytes::{CharIterator, Confidence, Encoding}; +use gosub_shared::byte_stream::{ByteStream, Confidence, Encoding}; use wrapper::node::CNode; /// Initialize a render tree and return an owning pointer to it. 
@@ -30,12 +30,13 @@ pub unsafe extern "C" fn gosub_rendertree_init(html: *const c_char) -> *mut Rend return ptr::null_mut(); } }; - let mut chars = CharIterator::new(); - chars.read_from_str(html_str, Some(Encoding::UTF8)); - chars.set_confidence(Confidence::Certain); + let mut stream = ByteStream::new(); + stream.read_from_str(html_str, Some(Encoding::UTF8)); + stream.set_confidence(Confidence::Certain); + stream.close(); let doc = DocumentBuilder::new_document(None); - let parse_result = Html5Parser::parse_document(&mut chars, Document::clone(&doc), None); + let parse_result = Html5Parser::parse_document(&mut stream, Document::clone(&doc), None); if parse_result.is_ok() { let mut rendertree = Box::new(RenderTree::new(&doc)); diff --git a/crates/gosub_css3/src/lib.rs b/crates/gosub_css3/src/lib.rs index 8f65a320f..adfd4105a 100644 --- a/crates/gosub_css3/src/lib.rs +++ b/crates/gosub_css3/src/lib.rs @@ -1,12 +1,10 @@ -use crate::location::Location; use crate::node::Node; use crate::parser_config::{Context, ParserConfig}; use crate::tokenizer::Tokenizer; -use gosub_shared::byte_stream::{ByteStream, Encoding, Stream}; +use gosub_shared::byte_stream::{ByteStream, Encoding, Location}; use gosub_shared::{timing_start, timing_stop}; pub mod convert; -pub mod location; mod node; pub mod parser; pub mod parser_config; @@ -47,11 +45,11 @@ impl<'stream> Css3<'stream> { pub fn parse(data: &str, config: ParserConfig) -> Result { let t_id = timing_start!("css3.parse", config.source.as_deref().unwrap_or("")); - let mut it = ByteStream::new(); - it.read_from_str(data, Some(Encoding::UTF8)); - it.close(); + let mut stream = ByteStream::new(); + stream.read_from_str(data, Some(Encoding::UTF8)); + stream.close(); - let mut parser = Css3::new(&mut it); + let mut parser = Css3::new(&mut stream); let ret = parser.parse_internal(config); timing_stop!(t_id); diff --git a/crates/gosub_css3/src/location.rs b/crates/gosub_css3/src/location.rs deleted file mode 100644 index 6fb8295f7..000000000 --- a/crates/gosub_css3/src/location.rs +++ /dev/null @@ -1,66 +0,0 @@ -use core::fmt::{Debug, Formatter}; - -/// Location holds the start position of the given element in the data source -#[derive(Clone, PartialEq)] -pub struct Location { - /// Line number, starting with 1 - line: u32, - /// Column number, starting with 1 - column: u32, - /// Byte offset, starting with 0 - offset: u32, -} - -impl Location { - pub(crate) fn inc_line(&mut self) { - self.line += 1; - } - pub(crate) fn inc_column(&mut self) { - self.column += 1; - } - pub(crate) fn set_column(&mut self, col: u32) { - self.column = col; - } - pub(crate) fn inc_offset(&mut self) { - self.offset += 1; - } -} - -impl Default for Location { - /// Default to line 1, column 1 - fn default() -> Self { - Self::new(1, 1, 0) - } -} - -impl Location { - /// Create a new Location - pub fn new(line: u32, column: u32, offset: u32) -> Self { - Self { - line, - column, - offset, - } - } - - /// Get the line number - pub fn line(&self) -> u32 { - self.line - } - - /// Get the column number - pub fn column(&self) -> u32 { - self.column - } - - /// Get the offset - pub fn offset(&self) -> u32 { - self.offset - } -} - -impl Debug for Location { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "({}:{})", self.line, self.column) - } -} diff --git a/crates/gosub_css3/src/node.rs b/crates/gosub_css3/src/node.rs index f150a0455..1b432f76b 100644 --- a/crates/gosub_css3/src/node.rs +++ b/crates/gosub_css3/src/node.rs @@ -1,8 +1,7 @@ use core::fmt::{Display, 
Formatter}; +use gosub_shared::byte_stream::Location; use std::ops::Deref; -use crate::location::Location; - pub type Number = f32; #[derive(Debug, Clone, PartialEq)] diff --git a/crates/gosub_css3/src/parser/anplusb.rs b/crates/gosub_css3/src/parser/anplusb.rs index 503d5d954..5e41dad43 100644 --- a/crates/gosub_css3/src/parser/anplusb.rs +++ b/crates/gosub_css3/src/parser/anplusb.rs @@ -250,15 +250,15 @@ impl Css3<'_> { #[cfg(test)] mod test { use super::*; - use gosub_shared::byte_stream::{ByteStream, Encoding, Stream}; + use gosub_shared::byte_stream::{ByteStream, Encoding}; macro_rules! test { ($func:ident, $input:expr, $expected:expr) => { - let mut it = ByteStream::new(); - it.read_from_str($input, Some(Encoding::UTF8)); - it.close(); + let mut stream = ByteStream::new(); + stream.read_from_str($input, Some(Encoding::UTF8)); + stream.close(); - let mut parser = crate::Css3::new(&mut it); + let mut parser = crate::Css3::new(&mut stream); let result = parser.$func().unwrap(); assert_eq!(result.node_type, $expected); diff --git a/crates/gosub_css3/src/parser/at_rule/supports.rs b/crates/gosub_css3/src/parser/at_rule/supports.rs index 4b1b6a9f1..34c20f670 100644 --- a/crates/gosub_css3/src/parser/at_rule/supports.rs +++ b/crates/gosub_css3/src/parser/at_rule/supports.rs @@ -17,15 +17,15 @@ impl Css3<'_> { #[cfg(test)] mod tests { use crate::walker::Walker; - use gosub_shared::byte_stream::{ByteStream, Encoding, Stream}; + use gosub_shared::byte_stream::{ByteStream, Encoding}; #[test] fn test_parse_at_rule_supports_prelude() { - let mut it = ByteStream::new(); - it.read_from_str("(display: flex)", Some(Encoding::UTF8)); - it.close(); - let mut parser = crate::Css3::new(&mut it); + let mut stream = ByteStream::new(); + stream.read_from_str("(display: flex)", Some(Encoding::UTF8)); + stream.close(); + let mut parser = crate::Css3::new(&mut stream); let node = parser.parse_at_rule_supports_prelude().unwrap(); let w = Walker::new(&node); diff --git a/crates/gosub_css3/src/parser/rule.rs b/crates/gosub_css3/src/parser/rule.rs index 82b25a6b3..bc7133199 100644 --- a/crates/gosub_css3/src/parser/rule.rs +++ b/crates/gosub_css3/src/parser/rule.rs @@ -30,15 +30,15 @@ impl Css3<'_> { #[cfg(test)] mod tests { use crate::walker::Walker; - use gosub_shared::byte_stream::{ByteStream, Encoding, Stream}; + use gosub_shared::byte_stream::{ByteStream, Encoding}; macro_rules! test { ($func:ident, $input:expr, $expected:expr) => { - let mut it = ByteStream::new(); - it.read_from_str($input, Some(Encoding::UTF8)); - it.close(); + let mut stream = ByteStream::new(); + stream.read_from_str($input, Some(Encoding::UTF8)); + stream.close(); - let mut parser = crate::Css3::new(&mut it); + let mut parser = crate::Css3::new(&mut stream); let result = parser.$func().unwrap(); let w = Walker::new(&result); diff --git a/crates/gosub_css3/src/parser/url.rs b/crates/gosub_css3/src/parser/url.rs index 0595fd05e..3103264a5 100644 --- a/crates/gosub_css3/src/parser/url.rs +++ b/crates/gosub_css3/src/parser/url.rs @@ -36,15 +36,15 @@ impl Css3<'_> { #[cfg(test)] mod tests { use crate::walker::Walker; - use gosub_shared::byte_stream::{ByteStream, Encoding, Stream}; + use gosub_shared::byte_stream::{ByteStream, Encoding}; macro_rules! 
test { ($func:ident, $input:expr, $expected:expr) => { - let mut it = ByteStream::new(); - it.read_from_str($input, Some(Encoding::UTF8)); - it.close(); + let mut stream = ByteStream::new(); + stream.read_from_str($input, Some(Encoding::UTF8)); + stream.close(); - let mut parser = crate::Css3::new(&mut it); + let mut parser = crate::Css3::new(&mut stream); let result = parser.$func().unwrap(); let w = Walker::new(&result); @@ -54,11 +54,11 @@ mod tests { macro_rules! test_err { ($func:ident, $input:expr, $expected:expr) => { - let mut it = ByteStream::new(); - it.read_from_str($input, Some(Encoding::UTF8)); - it.close(); + let mut stream = ByteStream::new(); + stream.read_from_str($input, Some(Encoding::UTF8)); + stream.close(); - let mut parser = crate::Css3::new(&mut it); + let mut parser = crate::Css3::new(&mut stream); let result = parser.$func(); assert_eq!(true, result.is_err()); diff --git a/crates/gosub_css3/src/parser_config.rs b/crates/gosub_css3/src/parser_config.rs index 8fe9617b1..b4a7b4c8c 100644 --- a/crates/gosub_css3/src/parser_config.rs +++ b/crates/gosub_css3/src/parser_config.rs @@ -1,4 +1,4 @@ -use crate::location::Location; +use gosub_shared::byte_stream::Location; /// Context defines how the data needs to be parsed pub enum Context { diff --git a/crates/gosub_css3/src/tokenizer.rs b/crates/gosub_css3/src/tokenizer.rs index 0c4f1001a..5e015ff0e 100644 --- a/crates/gosub_css3/src/tokenizer.rs +++ b/crates/gosub_css3/src/tokenizer.rs @@ -1,7 +1,7 @@ -use crate::location::Location; use crate::unicode::{get_unicode_char, UnicodeChar}; use gosub_shared::byte_stream::Character::Ch; -use gosub_shared::byte_stream::{ByteStream, Character, Stream}; +use gosub_shared::byte_stream::{ByteStream, Character}; +use gosub_shared::byte_stream::{Location, LocationHandler, Stream}; use std::fmt; pub type Number = f32; @@ -224,38 +224,28 @@ pub struct Tokenizer<'stream> { position: usize, /// Full list of all tokens produced by the tokenizer tokens: Vec, - /// List of all line endings - line_endings: Vec, - /// Start position of the stream (this does not have to be 1/1) - start_location: Location, - /// Current position of the stream, to get the absolute position, we must add start_location to it - cur_location: Location, - /// WHen true, the stream is closed and no more tokens can be produced + /// Handles line/col + location_handler: LocationHandler, + /// When true, the stream is closed and no more tokens can be produced eof: bool, } impl<'stream> Tokenizer<'stream> { /// Creates a new tokenizer with the given stream that starts on the given location. This does not have /// to be 1/1, but can be any location. 
- pub fn new(stream: &'stream mut ByteStream, location: Location) -> Self { + pub fn new(stream: &'stream mut ByteStream, start_location: Location) -> Self { Self { stream, position: 0, tokens: Vec::new(), - start_location: location.clone(), - cur_location: Location::new(1, 1, 0), + location_handler: LocationHandler::new(start_location), eof: false, - line_endings: Vec::new(), } } - /// Returns the current location and takes the start location into account + /// Returns the current location (line/col) of the tokenizer pub fn current_location(&self) -> Location { - Location::new( - self.start_location.line() + self.cur_location.line() - 1, - self.start_location.column() + self.cur_location.column() - 1, - self.cur_location.offset(), - ) + self.location_handler.cur_location.clone() } /// Returns true when there is no next element, and the stream is closed @@ -328,7 +318,7 @@ impl<'stream> Tokenizer<'stream> { pub fn reconsume(&mut self) { if self.position > 0 { self.position -= 1; - self.cur_location = self.tokens[self.position].location.clone(); + self.location_handler.dec(); } } @@ -906,37 +896,29 @@ impl<'stream> Tokenizer<'stream> { } pub fn tell(&self) -> usize { - self.cur_location.offset() as usize + self.stream.offset() } - pub fn slice(&self, start: usize, end: usize) -> String { - let mut s = String::new(); - for c in self.stream.get_slice(start, end) { + pub fn slice(&mut self, start: usize, end: usize) -> String { + let old_pos = self.stream.offset(); + self.stream.seek(start); + + // todo: this is not efficient + let mut s = String::with_capacity(end - start); + for c in self.stream.get_slice(end - start) { if let Ch(c) = c { s.push(*c); } } + self.stream.seek(old_pos); + s } fn next_char(&mut self) -> Character { - if self.stream.eof() { - return Character::StreamEnd; - } - - let c = self.stream.read(); - self.cur_location.inc_offset(); - if c == Ch('\n') { - self.cur_location.inc_line(); - self.cur_location.set_column(1); - } else { - self.cur_location.inc_column(); - } - - // advance position in the stream - self.stream.next(); - + let c = self.stream.read_and_next(); + self.location_handler.inc(c); c } @@ -967,14 +949,14 @@ mod test { #[test] fn parse_comment() { - let mut chars = ByteStream::new(); - chars.read_from_str("/* css comment */", Some(Encoding::UTF8)); - chars.close(); + let mut stream = ByteStream::new(); + stream.read_from_str("/* css comment */", Some(Encoding::UTF8)); + stream.close(); - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); tokenizer.consume_comment(); - assert!(chars.eof()) + assert!(stream.eof()) } #[test] @@ -1222,13 +1204,13 @@ mod test { #[test] fn produce_stream_of_double_quoted_strings() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str( + stream.read_from_str( "\"\" \"Lorem 'îpsum'\" \"a\\\nb\" \"a\nb \"eof", Some(Encoding::UTF8), ); - chars.close(); + stream.close(); let tokens = vec![ // `\"\"` @@ -1246,7 +1228,7 @@ mod test { Token::new(TokenType::Whitespace, Location::default()), Token::new_quoted_string("eof", Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { assert_token_eq!(tokenizer.consume_token(), token); @@ -1257,13 +1239,13 @@ mod test { #[test] fn procude_stream_of_single_quoted_strings() { - let mut chars = ByteStream::new(); + let mut stream = 
ByteStream::new(); - chars.read_from_str( + stream.read_from_str( "'' 'Lorem \"îpsum\"' 'a\\\nb' 'a\nb 'eof", Some(Encoding::UTF8), ); - chars.close(); + stream.close(); let tokens = vec![ // `\"\"` @@ -1281,7 +1263,7 @@ mod test { Token::new(TokenType::Whitespace, Location::default()), Token::new_quoted_string("eof", Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { assert_token_eq!(tokenizer.consume_token(), token); @@ -1292,13 +1274,13 @@ mod test { #[test] fn parse_urls_with_strings() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str( + stream.read_from_str( "url( '') url('Lorem \"îpsum\"'\n) url('a\\\nb' ) url('a\nb) url('eof", Some(Encoding::UTF8), ); - chars.close(); + stream.close(); let tokens = vec![ // `url( '')` @@ -1329,7 +1311,7 @@ mod test { Token::new_function("url", Location::default()), Token::new_quoted_string("eof", Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { assert_token_eq!(tokenizer.consume_token(), token); @@ -1340,9 +1322,9 @@ mod test { #[test] fn produce_valid_stream_of_css_tokens() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str( + stream.read_from_str( " /* Navbar */ #header .nav { @@ -1357,7 +1339,7 @@ mod test { ", Some(Encoding::UTF8), ); - chars.close(); + stream.close(); let tokens = vec![ // 1st css rule @@ -1405,7 +1387,7 @@ mod test { Token::new(TokenType::Whitespace, Location::default()), Token::new_url("https://gosub.io", Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); tokenizer.consume_whitespace(); for token in tokens { @@ -1415,15 +1397,15 @@ mod test { #[test] fn parse_rgba_expr() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str( + stream.read_from_str( " rgba(255, 50%, 0%, 1) ", Some(Encoding::UTF8), ); - chars.close(); + stream.close(); let tokens = vec![ Token::new(TokenType::Whitespace, Location::default()), @@ -1441,7 +1423,7 @@ mod test { Token::new(TokenType::RParen, Location::default()), Token::new(TokenType::Whitespace, Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { assert_token_eq!(tokenizer.consume_token(), token); @@ -1450,13 +1432,13 @@ mod test { #[test] fn parse_cdo_and_cdc() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str( + stream.read_from_str( "/* CDO/CDC are not special */ {}", Some(Encoding::UTF8), ); - chars.close(); + stream.close(); let tokens = vec![ Token::new(TokenType::Whitespace, Location::default()), @@ -1467,7 +1449,7 @@ mod test { Token::new(TokenType::LCurly, Location::default()), Token::new(TokenType::RCurly, Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { assert_token_eq!(tokenizer.consume_token(), token); @@ -1476,10 +1458,10 @@ mod test { #[test] fn parse_spaced_comments() { - let mut chars = ByteStream::new(); + let mut stream = 
ByteStream::new(); - chars.read_from_str("/*/*///** /* **/*//* ", Some(Encoding::UTF8)); - chars.close(); + stream.read_from_str("/*/*///** /* **/*//* ", Some(Encoding::UTF8)); + stream.close(); let tokens = vec![ Token::new_delim('/', Location::default()), @@ -1487,7 +1469,7 @@ mod test { Token::new_delim('/', Location::default()), Token::new(TokenType::Eof, Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { let t = tokenizer.consume_token(); @@ -1499,10 +1481,10 @@ mod test { #[test] fn parse_all_whitespaces() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str(" \t\t\r\n\nRed ", Some(Encoding::UTF8)); - chars.close(); + stream.read_from_str(" \t\t\r\n\nRed ", Some(Encoding::UTF8)); + stream.close(); let tokens = vec![ Token::new(TokenType::Whitespace, Location::default()), @@ -1510,7 +1492,7 @@ mod test { Token::new(TokenType::Whitespace, Location::default()), Token::new(TokenType::Eof, Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { assert_token_eq!(tokenizer.consume_token(), token); @@ -1521,13 +1503,13 @@ mod test { #[test] fn parse_at_keywords() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str( + stream.read_from_str( "@media0 @-Media @--media @0media @-0media @_media @.media @medİa @\\30 media\\", Some(Encoding::UTF8), ); - chars.close(); + stream.close(); let tokens = vec![ Token::new_atkeyword("media0", Location::default()), @@ -1559,7 +1541,7 @@ mod test { Token::new_atkeyword("0media\u{FFFD}", Location::default()), Token::new(TokenType::Eof, Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { assert_token_eq!(tokenizer.consume_token(), token); @@ -1570,13 +1552,13 @@ mod test { #[test] fn parse_id_selectors() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str( + stream.read_from_str( "#red0 #-Red #--red #-\\-red #0red #-0red #_Red #.red #rêd #êrd #\\.red\\", Some(Encoding::UTF8), ); - chars.close(); + stream.close(); let tokens = vec![ Token::new_id_hash("red0", Location::default()), @@ -1612,7 +1594,7 @@ mod test { Token::new_id_hash(".red\u{FFFD}", Location::default()), Token::new(TokenType::Eof, Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { assert_token_eq!(tokenizer.consume_token(), token); @@ -1623,13 +1605,13 @@ mod test { #[test] fn parse_dimension_tokens() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str( + stream.read_from_str( "12red0 12.0-red 12--red 12-\\-red 120red 12-0red 12\\0000red 12_Red 12.red 12rêd", Some(Encoding::UTF8), ); - chars.close(); + stream.close(); let tokens = vec![ // `12red0` @@ -1665,7 +1647,7 @@ mod test { // `12rêd` Token::new_dimension(12.0, "rêd", Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { assert_token_eq!(tokenizer.consume_token(), token); @@ -1676,13 
+1658,13 @@ mod test { #[test] fn parse_dimension_tokens_2() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str( + stream.read_from_str( "12e2px +34e+1px -45E-0px .68e+3px +.79e-1px -.01E2px 2.3E+1px +45.0e6px -0.67e0px", Some(Encoding::UTF8), ); - chars.close(); + stream.close(); let tokens = vec![ // `12e2px` @@ -1713,7 +1695,7 @@ mod test { Token::new_dimension(-0.67, "px", Location::default()), Token::new(TokenType::Eof, Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { assert_token_eq!(tokenizer.consume_token(), token); @@ -1724,13 +1706,13 @@ mod test { #[test] fn parse_percentage() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str( + stream.read_from_str( "12e2% +34e+1% -45E-0% .68e+3% +.79e-1% -.01E2% 2.3E+1% +45.0e6% -0.67e0%", Some(Encoding::UTF8), ); - chars.close(); + stream.close(); let tokens = vec![ // `12e2%` @@ -1761,7 +1743,7 @@ mod test { Token::new_percentage(-0.67, Location::default()), Token::new(TokenType::Eof, Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { assert_token_eq!(tokenizer.consume_token(), token); @@ -1772,13 +1754,13 @@ mod test { #[test] fn parse_css_seq_1() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str( + stream.read_from_str( "a:not([href^=http\\:], [href ^=\t'https\\:'\n]) { color: rgba(0%, 100%, 50%); }", Some(Encoding::UTF8), ); - chars.close(); + stream.close(); let tokens = vec![ Token::new_ident("a", Location::default()), @@ -1821,7 +1803,7 @@ mod test { Token::new(TokenType::Whitespace, Location::default()), Token::new(TokenType::RCurly, Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { assert_token_eq!(tokenizer.consume_token(), token); @@ -1832,10 +1814,10 @@ mod test { #[test] fn parse_css_seq_2() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str("red-->/* Not CDC */", Some(Encoding::UTF8)); - chars.close(); + stream.read_from_str("red-->/* Not CDC */", Some(Encoding::UTF8)); + stream.close(); let tokens = vec![ Token::new_ident("red--", Location::default()), @@ -1850,7 +1832,7 @@ mod test { // get the correct result. 
Token::new(TokenType::Eof, Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { assert_token_eq!(tokenizer.consume_token(), token); @@ -1861,10 +1843,10 @@ mod test { #[test] fn parse_css_seq_3() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str("\\- red0 -red --red -\\-red\\ blue 0red -0red \\0000red _Red .red rêd r\\êd \\007F\\0080\\0081", Some(Encoding::UTF8)); - chars.close(); + stream.read_from_str("\\- red0 -red --red -\\-red\\ blue 0red -0red \\0000red _Red .red rêd r\\êd \\007F\\0080\\0081", Some(Encoding::UTF8)); + stream.close(); let tokens = vec![ // `\\-` @@ -1907,7 +1889,7 @@ mod test { // `\\007F\\0080\\0081` Token::new_ident("\u{7f}\u{80}\u{81}", Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { assert_token_eq!(tokenizer.consume_token(), token); @@ -1918,13 +1900,13 @@ mod test { #[test] fn parse_css_seq_4() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str( + stream.read_from_str( "p[example=\"\\\nfoo(int x) {\\\n this.x = x;\\\n}\\\n\"]", Some(Encoding::UTF8), ); - chars.close(); + stream.close(); let tokens = vec![ Token::new_ident("p", Location::default()), @@ -1934,7 +1916,7 @@ mod test { Token::new_quoted_string("foo(int x) { this.x = x;}", Location::default()), Token::new(TokenType::RBracket, Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { assert_token_eq!(tokenizer.consume_token(), token); @@ -1945,11 +1927,11 @@ mod test { #[test] fn consume_tokenizer_as_stream_of_tokens() { - let mut chars = ByteStream::new(); - chars.read_from_str("[][]", Some(Encoding::UTF8)); - chars.close(); + let mut stream = ByteStream::new(); + stream.read_from_str("[][]", Some(Encoding::UTF8)); + stream.close(); - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); tokenizer.consume_all(); assert_token_eq!( @@ -1977,13 +1959,13 @@ mod test { #[test] fn parse_css_seq_5() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str( + stream.read_from_str( "test { color: #123; background-color: #11223344 }", Some(Encoding::UTF8), ); - chars.close(); + stream.close(); let tokens = vec![ Token::new_ident("test", Location::default()), @@ -2003,7 +1985,7 @@ mod test { Token::new(TokenType::Whitespace, Location::default()), Token::new(TokenType::RCurly, Location::default()), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { assert_token_eq!(tokenizer.consume_token(), token); @@ -2014,13 +1996,13 @@ mod test { #[test] fn location() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str( + stream.read_from_str( "test { color: #123; background-color: #11223344 }", Some(Encoding::UTF8), ); - chars.close(); + stream.close(); let tokens = vec![ Token::new_ident("test", Location::new(1, 1, 0)), @@ -2040,7 +2022,7 @@ mod test { Token::new(TokenType::Whitespace, Location::new(1, 48, 47)), 
Token::new(TokenType::RCurly, Location::new(1, 49, 48)), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { let t = tokenizer.consume_token(); @@ -2053,13 +2035,13 @@ mod test { #[test] fn location_multiline() { - let mut chars = ByteStream::new(); + let mut stream = ByteStream::new(); - chars.read_from_str( + stream.read_from_str( "test {\n color: #123;\n background-color: #11223344\n}", Some(Encoding::UTF8), ); - chars.close(); + stream.close(); let tokens = vec![ Token::new_ident("test", Location::new(1, 1, 0)), @@ -2079,7 +2061,7 @@ mod test { Token::new(TokenType::Whitespace, Location::new(3, 32, 55)), Token::new(TokenType::RCurly, Location::new(4, 1, 56)), ]; - let mut tokenizer = Tokenizer::new(&mut chars, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); for token in tokens { let t = tokenizer.consume_token(); diff --git a/crates/gosub_html5/src/error_logger.rs b/crates/gosub_html5/src/error_logger.rs index 298a52ebc..2f5304238 100755 --- a/crates/gosub_html5/src/error_logger.rs +++ b/crates/gosub_html5/src/error_logger.rs @@ -1,4 +1,4 @@ -use gosub_shared::bytes::Position; +use gosub_shared::byte_stream::Location; use gosub_shared::types::ParseError; /// Possible parser error enumerated @@ -42,6 +42,7 @@ pub enum ParserError { NoncharacterInInputStream, NonVoidHtmlElementStartTagWithTrailingSolidus, NullCharacterReference, + SelfClosingFlagOnEndTag, SurrogateCharacterReference, SurrogateInInputStream, UnexpectedCharacterAfterDoctypeSystemIdentifier, @@ -75,6 +76,7 @@ impl ParserError { ParserError::ControlCharacterReference => "control-character-reference", ParserError::EndTagWithAttributes => "end-tag-with-attributes", ParserError::DuplicateAttribute => "duplicate-attribute", + ParserError::SelfClosingFlagOnEndTag => "self-closing-flag-on-end-tag", ParserError::EndTagWithTrailingSolidus => "end-tag-with-trailing-solidus", ParserError::EofBeforeTagName => "eof-before-tag-name", ParserError::EofInCdata => "eof-in-cdata", @@ -178,22 +180,18 @@ impl ErrorLogger { } /// Adds a new error to the error logger - pub fn add_error(&mut self, pos: Position, message: &str) { + pub fn add_error(&mut self, location: Location, message: &str) { // Check if the error already exists, if so, don't add it again for err in &self.errors { - if err.line == pos.line && err.col == pos.col && err.message == *message { + if err.location == location && err.message == *message { return; } } self.errors.push(ParseError { - line: pos.line, - col: pos.col, - offset: pos.offset, message: message.to_string(), + location: location.clone(), }); - - // println!("Parse error ({}/{}): {}", pos.line, pos.col, message); } } @@ -206,11 +204,11 @@ mod tests { fn test_error_logger() { let mut logger = ErrorLogger::new(); - logger.add_error(Position::new(1, 1, 0), "test"); - logger.add_error(Position::new(1, 1, 0), "test"); - logger.add_error(Position::new(1, 1, 0), "test"); - logger.add_error(Position::new(1, 1, 0), "test"); - logger.add_error(Position::new(1, 1, 0), "test"); + logger.add_error(Location::new(1, 1, 0), "test"); + logger.add_error(Location::new(1, 1, 0), "test"); + logger.add_error(Location::new(1, 1, 0), "test"); + logger.add_error(Location::new(1, 1, 0), "test"); + logger.add_error(Location::new(1, 1, 0), "test"); assert_eq!(logger.get_errors().len(), 1); } @@ -219,11 +217,11 @@ mod tests { fn test_error_logger2() { let mut logger = 
ErrorLogger::new(); - logger.add_error(Position::new(1, 1, 0), "test"); - logger.add_error(Position::new(1, 2, 0), "test"); - logger.add_error(Position::new(1, 3, 0), "test"); - logger.add_error(Position::new(1, 4, 0), "test"); - logger.add_error(Position::new(1, 5, 0), "test"); + logger.add_error(Location::new(1, 1, 0), "test"); + logger.add_error(Location::new(1, 2, 0), "test"); + logger.add_error(Location::new(1, 3, 0), "test"); + logger.add_error(Location::new(1, 4, 0), "test"); + logger.add_error(Location::new(1, 5, 0), "test"); assert_eq!(logger.get_errors().len(), 5); } @@ -232,15 +230,15 @@ mod tests { fn test_error_logger3() { let mut logger = ErrorLogger::new(); - logger.add_error(Position::new(1, 1, 0), "test"); - logger.add_error(Position::new(1, 2, 0), "test"); - logger.add_error(Position::new(1, 3, 0), "test"); - logger.add_error(Position::new(1, 4, 0), "test"); - logger.add_error(Position::new(1, 5, 0), "test"); - logger.add_error(Position::new(1, 5, 0), "test"); - logger.add_error(Position::new(1, 5, 0), "test"); - logger.add_error(Position::new(1, 5, 0), "test"); - logger.add_error(Position::new(1, 5, 0), "test"); + logger.add_error(Location::new(1, 1, 0), "test"); + logger.add_error(Location::new(1, 2, 0), "test"); + logger.add_error(Location::new(1, 3, 0), "test"); + logger.add_error(Location::new(1, 4, 0), "test"); + logger.add_error(Location::new(1, 5, 0), "test"); + logger.add_error(Location::new(1, 5, 0), "test"); + logger.add_error(Location::new(1, 5, 0), "test"); + logger.add_error(Location::new(1, 5, 0), "test"); + logger.add_error(Location::new(1, 5, 0), "test"); assert_eq!(logger.get_errors().len(), 5); } @@ -249,24 +247,24 @@ mod tests { fn test_error_logger4() { let mut logger = ErrorLogger::new(); - logger.add_error(Position::new(0, 1, 1), "test"); - logger.add_error(Position::new(0, 1, 2), "test"); - logger.add_error(Position::new(0, 1, 3), "test"); - logger.add_error(Position::new(0, 1, 4), "test"); - logger.add_error(Position::new(0, 1, 5), "test"); - logger.add_error(Position::new(0, 1, 5), "test"); - logger.add_error(Position::new(0, 1, 5), "test"); - logger.add_error(Position::new(0, 1, 5), "test"); - logger.add_error(Position::new(0, 1, 5), "test"); - logger.add_error(Position::new(0, 2, 1), "test"); - logger.add_error(Position::new(0, 2, 2), "test"); - logger.add_error(Position::new(0, 2, 3), "test"); - logger.add_error(Position::new(0, 2, 4), "test"); - logger.add_error(Position::new(0, 2, 5), "test"); - logger.add_error(Position::new(0, 2, 5), "test"); - logger.add_error(Position::new(0, 2, 5), "test"); - logger.add_error(Position::new(0, 2, 5), "test"); - logger.add_error(Position::new(0, 2, 5), "test"); + logger.add_error(Location::new(0, 1, 1), "test"); + logger.add_error(Location::new(0, 1, 2), "test"); + logger.add_error(Location::new(0, 1, 3), "test"); + logger.add_error(Location::new(0, 1, 4), "test"); + logger.add_error(Location::new(0, 1, 5), "test"); + logger.add_error(Location::new(0, 1, 5), "test"); + logger.add_error(Location::new(0, 1, 5), "test"); + logger.add_error(Location::new(0, 1, 5), "test"); + logger.add_error(Location::new(0, 1, 5), "test"); + logger.add_error(Location::new(0, 2, 1), "test"); + logger.add_error(Location::new(0, 2, 2), "test"); + logger.add_error(Location::new(0, 2, 3), "test"); + logger.add_error(Location::new(0, 2, 4), "test"); + logger.add_error(Location::new(0, 2, 5), "test"); + logger.add_error(Location::new(0, 2, 5), "test"); + logger.add_error(Location::new(0, 2, 5), "test"); + 
logger.add_error(Location::new(0, 2, 5), "test"); + logger.add_error(Location::new(0, 2, 5), "test"); assert_eq!(logger.get_errors().len(), 10); } diff --git a/crates/gosub_html5/src/errors.rs b/crates/gosub_html5/src/errors.rs index ccbf5cf30..5fa941a27 100644 --- a/crates/gosub_html5/src/errors.rs +++ b/crates/gosub_html5/src/errors.rs @@ -1,4 +1,5 @@ //! Error results that can be returned from the engine +use gosub_shared::byte_stream::Location; use thiserror::Error; /// Parser error that defines an error (message) on the given position @@ -6,12 +7,7 @@ use thiserror::Error; pub struct ParseError { /// Parse error message pub message: String, - /// Line number (1-based) of the error - pub line: usize, - // Column (1-based) on line of the error - pub col: usize, - // Position (0-based) of the error in the input stream - pub offset: usize, + pub location: Location, } /// Serious errors and errors from third-party libraries diff --git a/crates/gosub_html5/src/parser.rs b/crates/gosub_html5/src/parser.rs index 84c31f34b..e3e2351d2 100644 --- a/crates/gosub_html5/src/parser.rs +++ b/crates/gosub_html5/src/parser.rs @@ -12,7 +12,7 @@ use gosub_css3::convert::ast_converter::convert_ast_to_stylesheet; use gosub_css3::parser_config::ParserConfig; use gosub_css3::stylesheet::{CssOrigin, CssStylesheet}; use gosub_css3::Css3; -use gosub_shared::bytes::CharIterator; +use gosub_shared::byte_stream::ByteStream; use gosub_shared::types::{ParseError, Result}; use gosub_shared::{timing_start, timing_stop}; @@ -282,10 +282,10 @@ impl<'chars> Html5Parser<'chars> { /// Creates a new parser with a dummy document and dummy tokenizer. This is ONLY used for testing purposes. /// Regular users should use the parse_document() and parse_fragment() functions instead. - pub fn new_parser(chars: &'chars mut CharIterator) -> Self { + pub fn new_parser(stream: &'chars mut ByteStream) -> Self { let doc = DocumentBuilder::new_document(None); let error_logger = Rc::new(RefCell::new(ErrorLogger::new())); - let tokenizer = Tokenizer::new(chars, None, error_logger.clone()); + let tokenizer = Tokenizer::new(stream, None, error_logger.clone()); Self { tokenizer, @@ -322,7 +322,7 @@ impl<'chars> Html5Parser<'chars> { /// Parses a fragment of HTML instead of a whole document. It will run the parser in a slightly different mode. /// This is used for parsing innerHTML and document fragments. pub fn parse_fragment( - chars: &mut CharIterator, + stream: &mut ByteStream, mut document: DocumentHandle, context_node: &Node, options: Option, @@ -341,7 +341,7 @@ impl<'chars> Html5Parser<'chars> { // 3. let error_logger = Rc::new(RefCell::new(ErrorLogger::new())); - let tokenizer = Tokenizer::new(chars, None, error_logger.clone()); + let tokenizer = Tokenizer::new(stream, None, error_logger.clone()); let mut parser = Html5Parser::init(tokenizer, Document::clone(&document), error_logger, options); @@ -403,7 +403,7 @@ impl<'chars> Html5Parser<'chars> { /// node that should not be used. The children of the root-node should be used on the context /// node where this document fragment needs to be inserted into. 
pub fn parse_document( - chars: &mut CharIterator, + stream: &mut ByteStream, document: DocumentHandle, options: Option, ) -> Result> { @@ -414,7 +414,7 @@ impl<'chars> Html5Parser<'chars> { Some(location) => timing_start!("html5.parse", location), None => timing_start!("html5.parse", "unknown"), }; - let tokenizer = Tokenizer::new(chars, None, error_logger.clone()); + let tokenizer = Tokenizer::new(stream, None, error_logger.clone()); let mut parser = Html5Parser::init(tokenizer, document, error_logger, options); let ret = parser.do_parse(); @@ -639,7 +639,7 @@ impl<'chars> Html5Parser<'chars> { self.open_elements.pop(); let old_insertion_point = self.insertion_point; - self.insertion_point = Some(self.tokenizer.get_position().offset); + self.insertion_point = Some(self.tokenizer.get_location().offset); self.script_nesting_level += 1; @@ -1055,7 +1055,7 @@ impl<'chars> Html5Parser<'chars> { self.insertion_mode = self.original_insertion_mode; let old_insertion_point = self.insertion_point; - self.insertion_point = Some(self.tokenizer.get_position().offset); + self.insertion_point = Some(self.tokenizer.get_location().offset); self.script_nesting_level += 1; @@ -1870,7 +1870,7 @@ impl<'chars> Html5Parser<'chars> { fn parse_error(&self, message: &str) { self.error_logger .borrow_mut() - .add_error(self.tokenizer.get_position(), message); + .add_error(self.tokenizer.get_location(), message); } /// Create a new node that is not connected or attached to the document arena @@ -4300,7 +4300,7 @@ impl<'chars> Html5Parser<'chars> { #[cfg(test)] mod test { - use gosub_shared::bytes::Encoding; + use gosub_shared::byte_stream::Encoding; use crate::parser::document::DocumentBuilder; @@ -4319,8 +4319,8 @@ mod test { #[test] fn is_in_scope() { - let chars = &mut CharIterator::new(); - let mut parser = Html5Parser::new_parser(chars); + let stream = &mut ByteStream::new(); + let mut parser = Html5Parser::new_parser(stream); node_create!(parser, "html"); node_create!(parser, "div"); @@ -4334,8 +4334,8 @@ mod test { #[test] fn is_in_scope_empty_stack() { - let chars = &mut CharIterator::new(); - let mut parser = Html5Parser::new_parser(chars); + let stream = &mut ByteStream::new(); + let mut parser = Html5Parser::new_parser(stream); parser.open_elements.clear(); assert!(!parser.is_in_scope("p", HTML_NAMESPACE, Scope::Regular)); @@ -4346,8 +4346,8 @@ mod test { #[test] fn is_in_scope_non_existing_node() { - let chars = &mut CharIterator::new(); - let mut parser = Html5Parser::new_parser(chars); + let stream = &mut ByteStream::new(); + let mut parser = Html5Parser::new_parser(stream); node_create!(parser, "html"); node_create!(parser, "div"); @@ -4362,8 +4362,8 @@ mod test { #[test] fn is_in_scope_1() { - let chars = &mut CharIterator::new(); - let mut parser = Html5Parser::new_parser(chars); + let stream = &mut ByteStream::new(); + let mut parser = Html5Parser::new_parser(stream); node_create!(parser, "html"); node_create!(parser, "div"); @@ -4400,8 +4400,8 @@ mod test { #[test] fn is_in_scope_2() { - let chars = &mut CharIterator::new(); - let mut parser = Html5Parser::new_parser(chars); + let stream = &mut ByteStream::new(); + let mut parser = Html5Parser::new_parser(stream); node_create!(parser, "html"); node_create!(parser, "body"); @@ -4419,8 +4419,8 @@ mod test { #[test] fn is_in_scope_3() { - let chars = &mut CharIterator::new(); - let mut parser = Html5Parser::new_parser(chars); + let stream = &mut ByteStream::new(); + let mut parser = Html5Parser::new_parser(stream); node_create!(parser, "html"); 
node_create!(parser, "body"); @@ -4438,8 +4438,8 @@ mod test { #[test] fn is_in_scope_4() { - let chars = &mut CharIterator::new(); - let mut parser = Html5Parser::new_parser(chars); + let stream = &mut ByteStream::new(); + let mut parser = Html5Parser::new_parser(stream); node_create!(parser, "html"); node_create!(parser, "body"); @@ -4459,8 +4459,8 @@ mod test { #[test] fn is_in_scope_5() { - let chars = &mut CharIterator::new(); - let mut parser = Html5Parser::new_parser(chars); + let stream = &mut ByteStream::new(); + let mut parser = Html5Parser::new_parser(stream); node_create!(parser, "html"); node_create!(parser, "body"); @@ -4479,8 +4479,8 @@ mod test { #[test] fn is_in_scope_6() { - let chars = &mut CharIterator::new(); - let mut parser = Html5Parser::new_parser(chars); + let stream = &mut ByteStream::new(); + let mut parser = Html5Parser::new_parser(stream); node_create!(parser, "html"); node_create!(parser, "body"); @@ -4499,8 +4499,8 @@ mod test { #[test] fn is_in_scope_7() { - let chars = &mut CharIterator::new(); - let mut parser = Html5Parser::new_parser(chars); + let stream = &mut ByteStream::new(); + let mut parser = Html5Parser::new_parser(stream); node_create!(parser, "html"); node_create!(parser, "body"); @@ -4518,8 +4518,8 @@ mod test { #[test] fn is_in_scope_8() { - let chars = &mut CharIterator::new(); - let mut parser = Html5Parser::new_parser(chars); + let stream = &mut ByteStream::new(); + let mut parser = Html5Parser::new_parser(stream); node_create!(parser, "html"); node_create!(parser, "body"); @@ -4536,25 +4536,27 @@ mod test { #[test] fn reconstruct_formatting() { - let mut chars = CharIterator::new(); - chars.read_from_str( + let mut stream = ByteStream::new(); + stream.read_from_str( "

boldbold and italicitalic

", Some(Encoding::UTF8), ); + stream.close(); let document = DocumentBuilder::new_document(None); - let _ = Html5Parser::parse_document(&mut chars, Document::clone(&document), None); + let _ = Html5Parser::parse_document(&mut stream, Document::clone(&document), None); println!("{}", document); } #[test] fn element_with_classes() { - let mut chars = CharIterator::new(); - chars.read_from_str("
", Some(Encoding::UTF8)); + let mut stream = ByteStream::new(); + stream.read_from_str("
", Some(Encoding::UTF8)); + stream.close(); let document = DocumentBuilder::new_document(None); - let _ = Html5Parser::parse_document(&mut chars, Document::clone(&document), None); + let _ = Html5Parser::parse_document(&mut stream, Document::clone(&document), None); let binding = document.get(); @@ -4578,14 +4580,15 @@ mod test { #[test] fn element_with_classes_extra_whitespace() { - let mut chars = CharIterator::new(); - chars.read_from_str( + let mut stream = ByteStream::new(); + stream.read_from_str( "
", Some(Encoding::UTF8), ); + stream.close(); let document = DocumentBuilder::new_document(None); - let _ = Html5Parser::parse_document(&mut chars, Document::clone(&document), None); + let _ = Html5Parser::parse_document(&mut stream, Document::clone(&document), None); let binding = document.get(); @@ -4609,15 +4612,16 @@ mod test { #[test] fn element_with_invalid_named_id() { - let mut chars = CharIterator::new(); - chars.read_from_str( + let mut stream = ByteStream::new(); + stream.read_from_str( "
\
", Some(Encoding::UTF8), ); + stream.close(); let document = DocumentBuilder::new_document(None); - let _ = Html5Parser::parse_document(&mut chars, Document::clone(&document), None); + let _ = Html5Parser::parse_document(&mut stream, Document::clone(&document), None); assert!(document.get().get_node_by_named_id("my id").is_none()); assert!(document.get().get_node_by_named_id("").is_none()); @@ -4625,15 +4629,16 @@ mod test { #[test] fn element_with_named_id() { - let mut chars = CharIterator::new(); - chars.read_from_str( + let mut stream = ByteStream::new(); + stream.read_from_str( "
\

", Some(Encoding::UTF8), ); + stream.close(); let document = DocumentBuilder::new_document(None); - let _ = Html5Parser::parse_document(&mut chars, Document::clone(&document), None); + let _ = Html5Parser::parse_document(&mut stream, Document::clone(&document), None); // we are expecting the div (ID: 4) and p would be ignored let doc_read = document.get(); diff --git a/crates/gosub_html5/src/parser/quirks.rs b/crates/gosub_html5/src/parser/quirks.rs index b9163357b..5f9cd35a2 100644 --- a/crates/gosub_html5/src/parser/quirks.rs +++ b/crates/gosub_html5/src/parser/quirks.rs @@ -157,12 +157,12 @@ static LIMITED_QUIRKS_PUB_IDENTIFIER_PREFIX_NOT_MISSING_SYS: &[&str] = &[ mod tests { use crate::parser::Html5Parser; use crate::parser::QuirksMode; - use gosub_shared::bytes::CharIterator; + use gosub_shared::byte_stream::ByteStream; #[test] fn test_quirks_mode() { - let chars = &mut CharIterator::new(); - let parser = Html5Parser::new_parser(chars); + let stream = &mut ByteStream::new(); + let parser = Html5Parser::new_parser(stream); assert_eq!( parser.identify_quirks_mode(&None, None, None, false), @@ -247,8 +247,8 @@ mod tests { #[test] fn test_quirks_mode_force() { - let chars = &mut CharIterator::new(); - let parser = Html5Parser::new_parser(chars); + let stream = &mut ByteStream::new(); + let parser = Html5Parser::new_parser(stream); assert_eq!( parser.identify_quirks_mode(&Some("html".to_string()), None, None, true), @@ -321,8 +321,8 @@ mod tests { #[test] fn test_quirks_mode_sys() { - let chars = &mut CharIterator::new(); - let parser = Html5Parser::new_parser(chars); + let stream = &mut ByteStream::new(); + let parser = Html5Parser::new_parser(stream); assert_eq!( parser.identify_quirks_mode( @@ -346,8 +346,8 @@ mod tests { #[test] fn test_quirks_mode_sys_missing() { - let chars = &mut CharIterator::new(); - let parser = Html5Parser::new_parser(chars); + let stream = &mut ByteStream::new(); + let parser = Html5Parser::new_parser(stream); assert_eq!( parser.identify_quirks_mode( diff --git a/crates/gosub_html5/src/parser/tree_builder.rs b/crates/gosub_html5/src/parser/tree_builder.rs index f532f906b..0213c5807 100644 --- a/crates/gosub_html5/src/parser/tree_builder.rs +++ b/crates/gosub_html5/src/parser/tree_builder.rs @@ -114,7 +114,7 @@ mod tests { .expect("problem parsing"); println!( - "tree construction: {}:{} {}", + "tree construction: {}:{}\n{}", test.file_path, test.line, test.document_as_str() diff --git a/crates/gosub_html5/src/tokenizer.rs b/crates/gosub_html5/src/tokenizer.rs index e26372d33..8dc239347 100644 --- a/crates/gosub_html5/src/tokenizer.rs +++ b/crates/gosub_html5/src/tokenizer.rs @@ -12,8 +12,8 @@ use crate::errors::Error; use crate::node::HTML_NAMESPACE; use crate::tokenizer::state::State; use crate::tokenizer::token::Token; -use crate::tokenizer::Bytes::{Ch, Eof}; -use gosub_shared::bytes::{Bytes, CharIterator, Position}; +use gosub_shared::byte_stream::Character::{Ch, StreamEnd}; +use gosub_shared::byte_stream::{ByteStream, Character, Location, LocationHandler, Stream}; use gosub_shared::types::Result; use std::cell::{Ref, RefCell}; use std::collections::HashMap; @@ -31,7 +31,9 @@ pub const CHAR_REPLACEMENT: char = '\u{FFFD}'; /// The tokenizer will read the input stream and emit tokens that can be used by the parser. 
pub struct Tokenizer<'stream> { /// HTML character input stream - pub chars: &'stream mut CharIterator, + pub stream: &'stream mut ByteStream, + /// Current location in the stream + location_handler: LocationHandler, /// Current state of the tokenizer pub state: State, /// Current consumed characters for current token @@ -50,6 +52,8 @@ pub struct Tokenizer<'stream> { pub token_queue: Vec, /// The last emitted start token (or empty if none) pub last_start_token: String, + /// Last read character + pub last_char: Character, /// Error logger to log errors to pub error_logger: Rc>, } @@ -98,8 +102,7 @@ impl Default for Options { /// Convert a character to lower case value (assumes character is in A-Z range) macro_rules! to_lowercase { ($c:expr) => { - // Converts A-Z to a-z - ((($c) as u8) + 0x20) as char + $c.to_lowercase().next().unwrap() }; } @@ -107,12 +110,13 @@ impl<'stream> Tokenizer<'stream> { /// Creates a new tokenizer with the given inputstream and additional options if any #[must_use] pub fn new( - chars: &'stream mut CharIterator, + stream: &'stream mut ByteStream, opts: Option, error_logger: Rc>, ) -> Self { return Self { - chars, + stream, + location_handler: LocationHandler::new(Location::default()), state: opts.as_ref().map_or(State::Data, |o| o.initial_state), last_start_token: opts.map_or(String::new(), |o| o.last_start_tag), consumed: String::new(), @@ -122,13 +126,15 @@ impl<'stream> Tokenizer<'stream> { current_attr_value: String::new(), current_attrs: HashMap::new(), temporary_buffer: String::new(), + last_char: StreamEnd, error_logger, }; } - /// Returns the current position in the stream (with line/col number and position) - pub(crate) fn get_position(&self) -> Position { - self.chars.position + /// Returns the current location in the stream (with line/col number and byte offset) + #[inline] + pub(crate) fn get_location(&self) -> Location { + self.location_handler.cur_location.clone() } /// Retrieves the next token from the input stream or Token::EOF when the end is reached @@ -162,21 +168,16 @@ impl<'stream> Tokenizer<'stream> { match self.state { State::Data => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('&') => self.state = State::CharacterReferenceInData, Ch('<') => self.state = State::TagOpen, Ch(CHAR_NUL) => { self.consume(c.into()); - self.parse_error(ParserError::UnexpectedNullCharacter); - } - Eof => { - // if self.has_consumed_data() { - // self.emit_token(Token::TextToken { value: self.get_consumed_str() }); - // self.clear_consume_buffer(); - // } - self.emit_token(Token::Eof); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); } + StreamEnd => self.emit_token(Token::Eof), _ => self.consume(c.into()), } } @@ -185,20 +186,15 @@ impl<'stream> Tokenizer<'stream> { self.state = State::Data; } State::RCDATA => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('&') => self.state = State::CharacterReferenceInRcData, Ch('<') => self.state = State::RCDATALessThanSign, - Eof => { - // if self.has_consumed_data() { - // self.emit_token(Token::TextToken { value: self.get_consumed_str().clone() }); - // self.clear_consume_buffer(); - // } - self.emit_token(Token::Eof); - } + StreamEnd => self.emit_token(Token::Eof), Ch(CHAR_NUL) => { self.consume(CHAR_REPLACEMENT); - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); } _ => self.consume(c.into()), } @@ -209,60 +205,45 @@ impl<'stream> Tokenizer<'stream> { self.state = State::RCDATA; } 
State::RAWTEXT => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('<') => self.state = State::RAWTEXTLessThanSign, Ch(CHAR_NUL) => { self.consume(CHAR_REPLACEMENT); - self.parse_error(ParserError::UnexpectedNullCharacter); - } - Eof => { - // EOF - // if self.has_consumed_data() { - // self.emit_token(Token::TextToken { value: self.get_consumed_str() }); - // self.clear_consume_buffer(); - // } - self.emit_token(Token::Eof); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); } + StreamEnd => self.emit_token(Token::Eof), _ => self.consume(c.into()), } } State::ScriptData => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('<') => self.state = State::ScriptDataLessThenSign, Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.consume(CHAR_REPLACEMENT); } - Eof => { - // if self.has_consumed_data() { - // self.emit_token(Token::TextToken { value: self.get_consumed_str().clone() }); - // self.clear_consume_buffer(); - // } - self.emit_token(Token::Eof); - } + StreamEnd => self.emit_token(Token::Eof), _ => self.consume(c.into()), } } State::PLAINTEXT => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.consume(CHAR_REPLACEMENT); } - Eof => { - // if self.has_consumed_data() { - // self.emit_token(Token::TextToken { value: self.get_consumed_str().clone() }); - // self.clear_consume_buffer(); - // } - self.emit_token(Token::Eof); - } + StreamEnd => self.emit_token(Token::Eof), _ => self.consume(c.into()), } } State::TagOpen => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('!') => self.state = State::MarkupDeclarationOpen, @@ -273,29 +254,33 @@ impl<'stream> Tokenizer<'stream> { is_self_closing: false, attributes: HashMap::new(), }); - self.chars.unread(); + self.stream_prev(); self.state = State::TagName; } Ch('?') => { self.current_token = Some(Token::Comment(String::new())); - self.parse_error(ParserError::UnexpectedQuestionMarkInsteadOfTagName); - self.chars.unread(); + self.parse_error( + ParserError::UnexpectedQuestionMarkInsteadOfTagName, + loc, + ); + self.stream_prev(); self.state = State::BogusComment; } - Eof => { - self.parse_error(ParserError::EofBeforeTagName); + StreamEnd => { + self.parse_error(ParserError::EofBeforeTagName, loc); self.consume('<'); self.state = State::Data; } _ => { - self.parse_error(ParserError::InvalidFirstCharacterOfTagName); + self.parse_error(ParserError::InvalidFirstCharacterOfTagName, loc); self.consume('<'); - self.chars.unread(); + self.stream_prev(); self.state = State::Data; } } } State::EndTagOpen => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(ch) if ch.is_ascii_alphabetic() => { @@ -303,28 +288,29 @@ impl<'stream> Tokenizer<'stream> { name: String::new(), is_self_closing: false, }); - self.chars.unread(); + self.stream_prev(); self.state = State::TagName; } Ch('>') => { - self.parse_error(ParserError::MissingEndTagName); + self.parse_error(ParserError::MissingEndTagName, loc); self.state = State::Data; } - Eof => { - self.parse_error(ParserError::EofBeforeTagName); + StreamEnd => { + self.parse_error(ParserError::EofBeforeTagName, loc); self.consume('<'); self.consume('/'); self.state = State::Data; } _ => { - self.parse_error(ParserError::InvalidFirstCharacterOfTagName); + 
self.parse_error(ParserError::InvalidFirstCharacterOfTagName, loc); self.current_token = Some(Token::Comment(String::new())); - self.chars.unread(); + self.stream_prev(); self.state = State::BogusComment; } } } State::TagName => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { @@ -337,11 +323,11 @@ impl<'stream> Tokenizer<'stream> { } Ch(ch @ 'A'..='Z') => self.add_to_token_name(to_lowercase!(ch)), Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.add_to_token_name(CHAR_REPLACEMENT); } - Eof => { - self.parse_error(ParserError::EofInTag); + StreamEnd => { + self.parse_error(ParserError::EofInTag, loc); self.state = State::Data; } _ => self.add_to_token_name(c.into()), @@ -354,7 +340,7 @@ impl<'stream> Tokenizer<'stream> { self.state = State::RCDATAEndTagOpen; } else { self.consume('<'); - self.chars.unread(); + self.stream_prev(); self.state = State::RCDATA; } } @@ -366,13 +352,13 @@ impl<'stream> Tokenizer<'stream> { name: String::new(), is_self_closing: false, }); - self.chars.unread(); + self.stream_prev(); self.state = State::RCDATAEndTagName; } _ => { self.consume('<'); self.consume('/'); - self.chars.unread(); + self.stream_prev(); self.state = State::RCDATA; } } @@ -419,20 +405,17 @@ impl<'stream> Tokenizer<'stream> { consume_anything_else = true; } } - Ch(ch @ 'A'..='Z') => { + Ch(ch) if ch.is_ascii_alphabetic() => { self.add_to_token_name(to_lowercase!(ch)); self.temporary_buffer.push(ch); } - Ch(ch @ 'a'..='z') => { - self.add_to_token_name(ch); - self.temporary_buffer.push(ch); - } _ => { consume_anything_else = true; } } if consume_anything_else { + self.stream_prev(); self.transition_to(State::RCDATA); } } @@ -443,7 +426,7 @@ impl<'stream> Tokenizer<'stream> { self.state = State::RAWTEXTEndTagOpen; } else { self.consume('<'); - self.chars.unread(); + self.stream_prev(); self.state = State::RAWTEXT; } } @@ -455,13 +438,13 @@ impl<'stream> Tokenizer<'stream> { name: String::new(), is_self_closing: false, }); - self.chars.unread(); + self.stream_prev(); self.state = State::RAWTEXTEndTagName; } _ => { self.consume('<'); self.consume('/'); - self.chars.unread(); + self.stream_prev(); self.state = State::RAWTEXT; } } @@ -508,20 +491,17 @@ impl<'stream> Tokenizer<'stream> { consume_anything_else = true; } } - Ch(ch @ 'A'..='Z') => { + Ch(ch) if ch.is_ascii_alphabetic() => { self.add_to_token_name(to_lowercase!(ch)); self.temporary_buffer.push(ch); } - Ch(ch @ 'a'..='z') => { - self.add_to_token_name(ch); - self.temporary_buffer.push(ch); - } _ => { consume_anything_else = true; } } if consume_anything_else { + self.stream_prev(); self.transition_to(State::RAWTEXT); } } @@ -539,7 +519,7 @@ impl<'stream> Tokenizer<'stream> { } _ => { self.consume('<'); - self.chars.unread(); + self.stream_prev(); self.state = State::ScriptData; } } @@ -549,16 +529,18 @@ impl<'stream> Tokenizer<'stream> { match c { Ch(ch) if ch.is_ascii_alphabetic() => { self.current_token = Some(Token::EndTag { - name: String::new(), + name: format!("{}", to_lowercase!(ch)), is_self_closing: false, }); - self.chars.unread(); + + self.temporary_buffer.push(ch); + self.state = State::ScriptDataEndTagName; } _ => { self.consume('<'); self.consume('/'); - self.chars.unread(); + self.stream_prev(); self.state = State::ScriptData; } } @@ -605,20 +587,17 @@ impl<'stream> Tokenizer<'stream> { consume_anything_else = true; } } - Ch(ch @ 'A'..='Z') => { + Ch(ch) if 
ch.is_ascii_alphabetic() => { self.add_to_token_name(to_lowercase!(ch)); self.temporary_buffer.push(ch); } - Ch(ch @ 'a'..='z') => { - self.add_to_token_name(ch); - self.temporary_buffer.push(ch); - } _ => { consume_anything_else = true; } } if consume_anything_else { + self.stream_prev(); self.transition_to(State::ScriptData); } } @@ -628,7 +607,7 @@ impl<'stream> Tokenizer<'stream> { self.consume('-'); self.state = State::ScriptDataEscapeStartDash; } else { - self.chars.unread(); + self.stream_prev(); self.state = State::ScriptData; } } @@ -638,11 +617,12 @@ impl<'stream> Tokenizer<'stream> { self.consume('-'); self.state = State::ScriptDataEscapedDashDash; } else { - self.chars.unread(); + self.stream_prev(); self.state = State::ScriptData; } } State::ScriptDataEscaped => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('-') => { @@ -653,11 +633,11 @@ impl<'stream> Tokenizer<'stream> { self.state = State::ScriptDataEscapedLessThanSign; } Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.consume(CHAR_REPLACEMENT); } - Eof => { - self.parse_error(ParserError::EofInScriptHtmlCommentLikeText); + StreamEnd => { + self.parse_error(ParserError::EofInScriptHtmlCommentLikeText, loc); self.state = State::Data; } _ => { @@ -666,6 +646,7 @@ impl<'stream> Tokenizer<'stream> { } } State::ScriptDataEscapedDash => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('-') => { @@ -676,12 +657,12 @@ impl<'stream> Tokenizer<'stream> { self.state = State::ScriptDataEscapedLessThanSign; } Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.consume(CHAR_REPLACEMENT); self.state = State::ScriptDataEscaped; } - Eof => { - self.parse_error(ParserError::EofInScriptHtmlCommentLikeText); + StreamEnd => { + self.parse_error(ParserError::EofInScriptHtmlCommentLikeText, loc); self.state = State::Data; } _ => { @@ -691,6 +672,7 @@ impl<'stream> Tokenizer<'stream> { } } State::ScriptDataEscapedDashDash => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('-') => { @@ -704,12 +686,12 @@ impl<'stream> Tokenizer<'stream> { self.state = State::ScriptData; } Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.consume(CHAR_REPLACEMENT); self.state = State::ScriptDataEscaped; } - Eof => { - self.parse_error(ParserError::EofInScriptHtmlCommentLikeText); + StreamEnd => { + self.parse_error(ParserError::EofInScriptHtmlCommentLikeText, loc); self.state = State::Data; } _ => { @@ -728,20 +710,19 @@ impl<'stream> Tokenizer<'stream> { Ch(ch) if ch.is_ascii_alphabetic() => { self.temporary_buffer.clear(); self.consume('<'); - self.chars.unread(); + self.stream_prev(); self.state = State::ScriptDataDoubleEscapeStart; } _ => { // anything else self.consume('<'); - self.chars.unread(); + self.stream_prev(); self.state = State::ScriptDataEscaped; } } } State::ScriptDataEscapedEndTagOpen => { let c = self.read_char(); - match c { Ch(ch) if ch.is_ascii_alphabetic() => { self.current_token = Some(Token::EndTag { @@ -749,13 +730,13 @@ impl<'stream> Tokenizer<'stream> { is_self_closing: false, }); - self.chars.unread(); + self.stream_prev(); self.state = State::ScriptDataEscapedEndTagName; } _ => { self.consume('<'); self.consume('/'); - self.chars.unread(); + self.stream_prev(); self.state = 
State::ScriptDataEscaped; } } @@ -802,20 +783,17 @@ impl<'stream> Tokenizer<'stream> { consume_anything_else = true; } } - Ch(ch @ 'A'..='Z') => { + Ch(ch) if ch.is_ascii_alphabetic() => { self.add_to_token_name(to_lowercase!(ch)); self.temporary_buffer.push(ch); } - Ch(ch @ 'a'..='z') => { - self.add_to_token_name(ch); - self.temporary_buffer.push(ch); - } _ => { consume_anything_else = true; } } if consume_anything_else { + self.stream_prev(); self.transition_to(State::ScriptDataEscaped); } } @@ -830,21 +808,18 @@ impl<'stream> Tokenizer<'stream> { } self.consume(c.into()); } - Ch(ch @ 'A'..='Z') => { + Ch(ch) if ch.is_ascii_alphabetic() => { self.temporary_buffer.push(to_lowercase!(ch)); self.consume(ch); } - Ch(ch @ 'a'..='z') => { - self.temporary_buffer.push(ch); - self.consume(ch); - } _ => { - self.chars.unread(); + self.stream_prev(); self.state = State::ScriptDataEscaped; } } } State::ScriptDataDoubleEscaped => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('-') => { @@ -856,17 +831,18 @@ impl<'stream> Tokenizer<'stream> { self.state = State::ScriptDataDoubleEscapedLessThanSign; } Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.consume(CHAR_REPLACEMENT); } - Eof => { - self.parse_error(ParserError::EofInScriptHtmlCommentLikeText); + StreamEnd => { + self.parse_error(ParserError::EofInScriptHtmlCommentLikeText, loc); self.state = State::Data; } _ => self.consume(c.into()), } } State::ScriptDataDoubleEscapedDash => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('-') => { @@ -878,12 +854,12 @@ impl<'stream> Tokenizer<'stream> { self.consume('<'); } Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.consume(CHAR_REPLACEMENT); self.state = State::ScriptDataDoubleEscaped; } - Eof => { - self.parse_error(ParserError::EofInScriptHtmlCommentLikeText); + StreamEnd => { + self.parse_error(ParserError::EofInScriptHtmlCommentLikeText, loc); self.state = State::Data; } _ => { @@ -893,6 +869,7 @@ impl<'stream> Tokenizer<'stream> { } } State::ScriptDataDoubleEscapedDashDash => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('-') => self.consume('-'), @@ -905,12 +882,12 @@ impl<'stream> Tokenizer<'stream> { self.state = State::ScriptData; } Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.consume(CHAR_REPLACEMENT); self.state = State::ScriptDataDoubleEscaped; } - Eof => { - self.parse_error(ParserError::EofInScriptHtmlCommentLikeText); + StreamEnd => { + self.parse_error(ParserError::EofInScriptHtmlCommentLikeText, loc); self.state = State::Data; } _ => { @@ -926,7 +903,7 @@ impl<'stream> Tokenizer<'stream> { self.consume('/'); self.state = State::ScriptDataDoubleEscapeEnd; } else { - self.chars.unread(); + self.stream_prev(); self.state = State::ScriptDataDoubleEscaped; } } @@ -941,32 +918,32 @@ impl<'stream> Tokenizer<'stream> { } self.consume(c.into()); } - Ch(ch @ 'A'..='Z') => { + Ch(ch) if ch.is_ascii_alphabetic() => { self.temporary_buffer.push(to_lowercase!(ch)); self.consume(ch); } - Ch(ch @ 'a'..='z') => { - self.temporary_buffer.push(ch); - self.consume(ch); - } _ => { - self.chars.unread(); + self.stream_prev(); self.state = State::ScriptDataDoubleEscaped; } } } State::BeforeAttributeName => { + let loc = self.get_location(); 
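// `loc` is captured before `read_char()` advances the stream, so the errors
// reported in the arms below (such as UnexpectedEqualsSignBeforeAttributeName)
// point at the offending character itself rather than one position past it.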
let c = self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { // Ignore character } - Ch('/' | '>') | Eof => { - self.chars.unread(); + Ch('/' | '>') | StreamEnd => { + self.stream_prev(); self.state = State::AfterAttributeName; } Ch('=') => { - self.parse_error(ParserError::UnexpectedEqualsSignBeforeAttributeName); + self.parse_error( + ParserError::UnexpectedEqualsSignBeforeAttributeName, + loc, + ); self.store_and_clear_current_attribute(); self.current_attr_name.push(c.into()); @@ -977,25 +954,32 @@ impl<'stream> Tokenizer<'stream> { // Store an existing attribute if any and clear self.store_and_clear_current_attribute(); - self.chars.unread(); + self.stream_prev(); self.state = State::AttributeName; } } } State::AttributeName => { + let loc = self.get_location(); let c = self.read_char(); match c { - Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE | '/' | '>') | Eof => { + Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE | '/' | '>') => { if self.attr_already_exists() { - self.parse_error(ParserError::DuplicateAttribute); + self.parse_error(ParserError::DuplicateAttribute, loc); } - self.chars.unread(); + self.stream_prev(); self.state = State::AfterAttributeName; } + StreamEnd => { + if self.attr_already_exists() { + self.parse_error(ParserError::DuplicateAttribute, loc); + } + self.state = State::AfterAttributeName; + } Ch('=') => { if self.attr_already_exists() { - self.parse_error(ParserError::DuplicateAttribute); + self.parse_error(ParserError::DuplicateAttribute, loc); } self.state = State::BeforeAttributeValue; } @@ -1003,17 +987,18 @@ impl<'stream> Tokenizer<'stream> { self.current_attr_name.push(to_lowercase!(ch)); } Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.current_attr_name.push(CHAR_REPLACEMENT); } Ch('"' | '\'' | '<') => { - self.parse_error(ParserError::UnexpectedCharacterInAttributeName); + self.parse_error(ParserError::UnexpectedCharacterInAttributeName, loc); self.current_attr_name.push(c.into()); } _ => self.current_attr_name.push(c.into()), } } State::AfterAttributeName => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { @@ -1027,18 +1012,19 @@ impl<'stream> Tokenizer<'stream> { self.emit_current_token(); self.state = State::Data; } - Eof => { - self.parse_error(ParserError::EofInTag); + StreamEnd => { + self.parse_error(ParserError::EofInTag, loc); self.state = State::Data; } _ => { self.store_and_clear_current_attribute(); - self.chars.unread(); + self.stream_prev(); self.state = State::AttributeName; } } } State::BeforeAttributeValue => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { @@ -1049,20 +1035,25 @@ impl<'stream> Tokenizer<'stream> { self.state = State::AttributeValueSingleQuoted; } Ch('>') => { - self.parse_error(ParserError::MissingAttributeValue); + self.parse_error(ParserError::MissingAttributeValue, loc); self.store_and_clear_current_attribute(); self.add_stored_attributes_to_current_token(); self.emit_current_token(); self.state = State::Data; } + StreamEnd => { + self.parse_error(ParserError::EofInTag, loc); + self.state = State::Data; + } _ => { - self.chars.unread(); + self.stream_prev(); self.state = State::AttributeValueUnquoted; } } } State::AttributeValueDoubleQuoted => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('"') => self.state = 
State::AfterAttributeValueQuoted, @@ -1070,11 +1061,11 @@ impl<'stream> Tokenizer<'stream> { self.consume_character_reference(Some(Ch('"')), true); } Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.current_attr_value.push(CHAR_REPLACEMENT); } - Eof => { - self.parse_error(ParserError::EofInTag); + StreamEnd => { + self.parse_error(ParserError::EofInTag, loc); self.state = State::Data; } _ => { @@ -1083,6 +1074,7 @@ impl<'stream> Tokenizer<'stream> { } } State::AttributeValueSingleQuoted => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('\'') => self.state = State::AfterAttributeValueQuoted, @@ -1090,11 +1082,11 @@ impl<'stream> Tokenizer<'stream> { self.consume_character_reference(Some(Ch('\'')), true); } Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.current_attr_value.push(CHAR_REPLACEMENT); } - Eof => { - self.parse_error(ParserError::EofInTag); + StreamEnd => { + self.parse_error(ParserError::EofInTag, loc); self.state = State::Data; } _ => { @@ -1103,6 +1095,7 @@ impl<'stream> Tokenizer<'stream> { } } State::AttributeValueUnquoted => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { @@ -1118,17 +1111,18 @@ impl<'stream> Tokenizer<'stream> { self.state = State::Data; } Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.current_attr_value.push(CHAR_REPLACEMENT); } Ch('"' | '\'' | '<' | '=' | '`') => { self.parse_error( ParserError::UnexpectedCharacterInUnquotedAttributeValue, + loc, ); self.current_attr_value.push(c.into()); } - Eof => { - self.parse_error(ParserError::EofInTag); + StreamEnd => { + self.parse_error(ParserError::EofInTag, loc); self.state = State::Data; } _ => { @@ -1138,6 +1132,7 @@ impl<'stream> Tokenizer<'stream> { } // State::CharacterReferenceInAttributeValue => {} State::AfterAttributeValueQuoted => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { @@ -1150,47 +1145,51 @@ impl<'stream> Tokenizer<'stream> { self.emit_current_token(); self.state = State::Data; } - Eof => { - self.parse_error(ParserError::EofInTag); + StreamEnd => { + self.parse_error(ParserError::EofInTag, loc); self.state = State::Data; } _ => { - self.parse_error(ParserError::MissingWhitespaceBetweenAttributes); - self.chars.unread(); + self.parse_error(ParserError::MissingWhitespaceBetweenAttributes, loc); + self.stream_prev(); self.state = State::BeforeAttributeName; } } } State::SelfClosingStart => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('>') => { self.set_is_closing_in_current_token(true); + + self.state = State::Data; + self.store_and_clear_current_attribute(); self.add_stored_attributes_to_current_token(); self.emit_current_token(); - self.state = State::Data; } - Eof => { - self.parse_error(ParserError::EofInTag); + StreamEnd => { + self.parse_error(ParserError::EofInTag, loc); self.state = State::Data; } _ => { - self.parse_error(ParserError::UnexpectedSolidusInTag); - self.chars.unread(); + self.stream_prev(); + self.parse_error(ParserError::UnexpectedSolidusInTag, loc); self.state = State::BeforeAttributeName; } } } State::BogusComment => { + let loc = self.get_location(); let c = self.read_char(); match c 
{ - Ch('>') | Eof => { + Ch('>') | StreamEnd => { self.emit_current_token(); self.state = State::Data; } Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.add_to_token_value(CHAR_REPLACEMENT); } _ => { @@ -1199,85 +1198,90 @@ impl<'stream> Tokenizer<'stream> { } } State::MarkupDeclarationOpen => { - if self.chars.look_ahead_slice(2) == "--" { + if Character::slice_to_string(self.stream.get_slice(2)) == "--" { self.current_token = Some(Token::Comment(String::new())); // Skip the two -- signs - self.chars.skip(2); + self.stream_next_n(2); self.state = State::CommentStart; continue; } - if self.chars.look_ahead_slice(7).to_uppercase() == "DOCTYPE" { - self.chars.skip(7); + if Character::slice_to_string(self.stream.get_slice(7)).to_uppercase() + == "DOCTYPE" + { + self.stream_next_n(7); self.state = State::DOCTYPE; continue; } - if self.chars.look_ahead_slice(7) == "[CDATA[" { - self.chars.skip(7); + if Character::slice_to_string(self.stream.get_slice(7)) == "[CDATA[" { + self.stream_next_n(6); + let loc = self.get_location(); + self.stream_next_n(1); if parser_data.adjusted_node_namespace != HTML_NAMESPACE { self.state = State::CDATASection; continue; } - self.parse_error(ParserError::CdataInHtmlContent); + self.parse_error(ParserError::CdataInHtmlContent, loc); self.current_token = Some(Token::Comment("[CDATA[".into())); self.state = State::BogusComment; continue; } - self.chars.read_char(); - self.parse_error(ParserError::IncorrectlyOpenedComment); - self.chars.unread(); + self.parse_error(ParserError::IncorrectlyOpenedComment, self.get_location()); self.current_token = Some(Token::Comment(String::new())); self.state = State::BogusComment; } State::CommentStart => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('-') => { self.state = State::CommentStartDash; } Ch('>') => { - self.parse_error(ParserError::AbruptClosingOfEmptyComment); + self.parse_error(ParserError::AbruptClosingOfEmptyComment, loc); self.emit_current_token(); self.state = State::Data; } _ => { - self.chars.unread(); + self.stream_prev(); self.state = State::Comment; } } } State::CommentStartDash => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('-') => { self.state = State::CommentEnd; } Ch('>') => { - self.parse_error(ParserError::AbruptClosingOfEmptyComment); + self.parse_error(ParserError::AbruptClosingOfEmptyComment, loc); self.emit_current_token(); self.state = State::Data; } - Eof => { - self.parse_error(ParserError::EofInComment); + StreamEnd => { + self.parse_error(ParserError::EofInComment, loc); self.emit_current_token(); self.state = State::Data; } _ => { self.add_to_token_value('-'); - self.chars.unread(); + self.stream_prev(); self.state = State::Comment; } } } State::Comment => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('<') => { @@ -1286,11 +1290,11 @@ impl<'stream> Tokenizer<'stream> { } Ch('-') => self.state = State::CommentEndDash, Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.add_to_token_value(CHAR_REPLACEMENT); } - Eof => { - self.parse_error(ParserError::EofInComment); + StreamEnd => { + self.parse_error(ParserError::EofInComment, loc); self.emit_current_token(); self.state = State::Data; } @@ -1310,7 +1314,7 @@ impl<'stream> Tokenizer<'stream> { self.add_to_token_value(c.into()); } _ => { - self.chars.unread(); + 
self.stream_prev(); self.state = State::Comment; } } @@ -1320,7 +1324,7 @@ impl<'stream> Tokenizer<'stream> { if let Ch('-') = c { self.state = State::CommentLessThanSignBangDash; } else { - self.chars.unread(); + self.stream_prev(); self.state = State::Comment; } } @@ -1329,40 +1333,49 @@ impl<'stream> Tokenizer<'stream> { if let Ch('-') = c { self.state = State::CommentLessThanSignBangDashDash; } else { - self.chars.unread(); + self.stream_prev(); self.state = State::CommentEndDash; } } State::CommentLessThanSignBangDashDash => { + let loc = self.get_location(); let c = self.read_char(); - if let Eof | Ch('>') = c { - self.chars.unread(); - self.state = State::CommentEnd; - } else { - self.parse_error(ParserError::NestedComment); - self.chars.unread(); - self.state = State::CommentEnd; + match c { + StreamEnd => { + self.state = State::CommentEnd; + } + Ch('>') => { + self.stream_prev(); + self.state = State::CommentEnd; + } + _ => { + self.parse_error(ParserError::NestedComment, loc); + self.stream_prev(); + self.state = State::CommentEnd; + } } } State::CommentEndDash => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('-') => { self.state = State::CommentEnd; } - Eof => { - self.parse_error(ParserError::EofInComment); + StreamEnd => { + self.parse_error(ParserError::EofInComment, loc); self.emit_current_token(); self.state = State::Data; } _ => { self.add_to_token_value('-'); - self.chars.unread(); + self.stream_prev(); self.state = State::Comment; } } } State::CommentEnd => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('>') => { @@ -1371,20 +1384,21 @@ impl<'stream> Tokenizer<'stream> { } Ch('!') => self.state = State::CommentEndBang, Ch('-') => self.add_to_token_value('-'), - Eof => { - self.parse_error(ParserError::EofInComment); + StreamEnd => { + self.parse_error(ParserError::EofInComment, loc); self.emit_current_token(); self.state = State::Data; } _ => { self.add_to_token_value('-'); self.add_to_token_value('-'); - self.chars.unread(); + self.stream_prev(); self.state = State::Comment; } } } State::CommentEndBang => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('-') => { @@ -1395,12 +1409,12 @@ impl<'stream> Tokenizer<'stream> { self.state = State::CommentEndDash; } Ch('>') => { - self.parse_error(ParserError::IncorrectlyClosedComment); + self.parse_error(ParserError::IncorrectlyClosedComment, loc); self.emit_current_token(); self.state = State::Data; } - Eof => { - self.parse_error(ParserError::EofInComment); + StreamEnd => { + self.parse_error(ParserError::EofInComment, loc); self.emit_current_token(); self.state = State::Data; } @@ -1408,23 +1422,24 @@ impl<'stream> Tokenizer<'stream> { self.add_to_token_value('-'); self.add_to_token_value('-'); self.add_to_token_value('!'); - self.chars.unread(); + self.stream_prev(); self.state = State::Comment; } } } State::DOCTYPE => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { self.state = State::BeforeDOCTYPEName; } Ch('>') => { - self.chars.unread(); + self.stream_prev(); self.state = State::BeforeDOCTYPEName; } - Eof => { - self.parse_error(ParserError::EofInDoctype); + StreamEnd => { + self.parse_error(ParserError::EofInDoctype, loc); self.emit_token(Token::DocType { name: None, @@ -1436,13 +1451,14 @@ impl<'stream> Tokenizer<'stream> { self.state = State::Data; } _ => { - self.parse_error(ParserError::MissingWhitespaceBeforeDoctypeName); - self.chars.unread(); + 
self.parse_error(ParserError::MissingWhitespaceBeforeDoctypeName, loc); + self.stream_prev(); self.state = State::BeforeDOCTYPEName; } } } State::BeforeDOCTYPEName => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { @@ -1460,7 +1476,7 @@ impl<'stream> Tokenizer<'stream> { self.state = State::DOCTYPEName; } Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.current_token = Some(Token::DocType { name: None, force_quirks: false, @@ -1472,7 +1488,7 @@ impl<'stream> Tokenizer<'stream> { self.state = State::DOCTYPEName; } Ch('>') => { - self.parse_error(ParserError::MissingDoctypeName); + self.parse_error(ParserError::MissingDoctypeName, loc); self.emit_token(Token::DocType { name: None, force_quirks: true, @@ -1483,8 +1499,8 @@ impl<'stream> Tokenizer<'stream> { self.state = State::Data; } - Eof => { - self.parse_error(ParserError::EofInDoctype); + StreamEnd => { + self.parse_error(ParserError::EofInDoctype, loc); self.emit_token(Token::DocType { name: None, @@ -1509,6 +1525,7 @@ impl<'stream> Tokenizer<'stream> { } } State::DOCTYPEName => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { @@ -1520,11 +1537,11 @@ impl<'stream> Tokenizer<'stream> { } Ch(ch @ 'A'..='Z') => self.add_to_token_name(to_lowercase!(ch)), Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.add_to_token_name(CHAR_REPLACEMENT); } - Eof => { - self.parse_error(ParserError::EofInDoctype); + StreamEnd => { + self.parse_error(ParserError::EofInDoctype, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; @@ -1533,6 +1550,7 @@ impl<'stream> Tokenizer<'stream> { } } State::AfterDOCTYPEName => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { @@ -1542,36 +1560,45 @@ impl<'stream> Tokenizer<'stream> { self.emit_current_token(); self.state = State::Data; } - Eof => { - self.parse_error(ParserError::EofInDoctype); + StreamEnd => { + self.parse_error(ParserError::EofInDoctype, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; } _ => { - self.chars.unread(); - if self.chars.look_ahead_slice(6).to_uppercase() == "PUBLIC" { - self.chars.skip(6); + self.stream_prev(); + if Character::slice_to_string(self.stream.get_slice(6)).to_uppercase() + == "PUBLIC" + { + self.stream_next_n(6); self.state = State::AfterDOCTYPEPublicKeyword; continue; } - if self.chars.look_ahead_slice(6).to_uppercase() == "SYSTEM" { - self.chars.skip(6); + if Character::slice_to_string(self.stream.get_slice(6)).to_uppercase() + == "SYSTEM" + { + self.stream_next_n(6); self.state = State::AfterDOCTYPESystemKeyword; continue; } // Make sure the parser is on the correct position again since we just // unread the character - self.chars.skip(1); - self.parse_error(ParserError::InvalidCharacterSequenceAfterDoctypeName); - self.chars.unread(); + let loc = self.get_location(); + self.stream_next_n(1); + self.parse_error( + ParserError::InvalidCharacterSequenceAfterDoctypeName, + loc, + ); + self.stream_prev(); self.set_quirks_mode(true); - self.chars.unread(); + self.stream_prev(); self.state = State::BogusDOCTYPE; } } } State::AfterDOCTYPEPublicKeyword => { + let loc = self.get_location(); let c = 
self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { @@ -1580,6 +1607,7 @@ impl<'stream> Tokenizer<'stream> { Ch('"') => { self.parse_error( ParserError::MissingWhitespaceAfterDoctypePublicKeyword, + loc, ); self.set_public_identifier(String::new()); self.state = State::DOCTYPEPublicIdentifierDoubleQuoted; @@ -1587,18 +1615,19 @@ impl<'stream> Tokenizer<'stream> { Ch('\'') => { self.parse_error( ParserError::MissingWhitespaceAfterDoctypePublicKeyword, + loc, ); self.set_public_identifier(String::new()); self.state = State::DOCTYPEPublicIdentifierSingleQuoted; } Ch('>') => { - self.parse_error(ParserError::MissingDoctypePublicIdentifier); + self.parse_error(ParserError::MissingDoctypePublicIdentifier, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; } - Eof => { - self.parse_error(ParserError::EofInDoctype); + StreamEnd => { + self.parse_error(ParserError::EofInDoctype, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; @@ -1606,14 +1635,16 @@ impl<'stream> Tokenizer<'stream> { _ => { self.parse_error( ParserError::MissingQuoteBeforeDoctypePublicIdentifier, + loc, ); - self.chars.unread(); + self.stream_prev(); self.set_quirks_mode(true); self.state = State::BogusDOCTYPE; } } } State::BeforeDOCTYPEPublicIdentifier => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { @@ -1628,21 +1659,22 @@ impl<'stream> Tokenizer<'stream> { self.state = State::DOCTYPEPublicIdentifierSingleQuoted; } Ch('>') => { - self.parse_error(ParserError::MissingDoctypePublicIdentifier); + self.parse_error(ParserError::MissingDoctypePublicIdentifier, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; } - Eof => { - self.parse_error(ParserError::EofInDoctype); + StreamEnd => { + self.parse_error(ParserError::EofInDoctype, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; } _ => { - self.chars.unread(); + self.stream_prev(); self.parse_error( ParserError::MissingQuoteBeforeDoctypePublicIdentifier, + loc, ); self.set_quirks_mode(true); self.state = State::BogusDOCTYPE; @@ -1650,21 +1682,22 @@ impl<'stream> Tokenizer<'stream> { } } State::DOCTYPEPublicIdentifierDoubleQuoted => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('"') => self.state = State::AfterDOCTYPEPublicIdentifier, Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.add_public_identifier(CHAR_REPLACEMENT); } Ch('>') => { - self.parse_error(ParserError::AbruptDoctypePublicIdentifier); + self.parse_error(ParserError::AbruptDoctypePublicIdentifier, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; } - Eof => { - self.parse_error(ParserError::EofInDoctype); + StreamEnd => { + self.parse_error(ParserError::EofInDoctype, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; @@ -1673,21 +1706,22 @@ impl<'stream> Tokenizer<'stream> { } } State::DOCTYPEPublicIdentifierSingleQuoted => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('\'') => self.state = State::AfterDOCTYPEPublicIdentifier, Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.add_public_identifier(CHAR_REPLACEMENT); } Ch('>') => { - 
self.parse_error(ParserError::AbruptDoctypePublicIdentifier); + self.parse_error(ParserError::AbruptDoctypePublicIdentifier, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; } - Eof => { - self.parse_error(ParserError::EofInDoctype); + StreamEnd => { + self.parse_error(ParserError::EofInDoctype, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; @@ -1696,6 +1730,7 @@ impl<'stream> Tokenizer<'stream> { } } State::AfterDOCTYPEPublicIdentifier => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { @@ -1706,17 +1741,17 @@ impl<'stream> Tokenizer<'stream> { self.state = State::Data; } Ch('"') => { - self.parse_error(ParserError::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); + self.parse_error(ParserError::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers, loc); self.set_system_identifier(String::new()); self.state = State::DOCTYPESystemIdentifierDoubleQuoted; } Ch('\'') => { - self.parse_error(ParserError::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); + self.parse_error(ParserError::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers, loc); self.set_system_identifier(String::new()); self.state = State::DOCTYPESystemIdentifierSingleQuoted; } - Eof => { - self.parse_error(ParserError::EofInDoctype); + StreamEnd => { + self.parse_error(ParserError::EofInDoctype, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; @@ -1724,14 +1759,16 @@ impl<'stream> Tokenizer<'stream> { _ => { self.parse_error( ParserError::MissingQuoteBeforeDoctypeSystemIdentifier, + loc, ); - self.chars.unread(); + self.stream_prev(); self.set_quirks_mode(true); self.state = State::BogusDOCTYPE; } } } State::BetweenDOCTYPEPublicAndSystemIdentifiers => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { @@ -1749,8 +1786,8 @@ impl<'stream> Tokenizer<'stream> { self.set_system_identifier(String::new()); self.state = State::DOCTYPESystemIdentifierSingleQuoted; } - Eof => { - self.parse_error(ParserError::EofInDoctype); + StreamEnd => { + self.parse_error(ParserError::EofInDoctype, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; @@ -1758,14 +1795,16 @@ impl<'stream> Tokenizer<'stream> { _ => { self.parse_error( ParserError::MissingQuoteBeforeDoctypeSystemIdentifier, + loc, ); - self.chars.unread(); + self.stream_prev(); self.set_quirks_mode(true); self.state = State::BogusDOCTYPE; } } } State::AfterDOCTYPESystemKeyword => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { @@ -1774,6 +1813,7 @@ impl<'stream> Tokenizer<'stream> { Ch('"') => { self.parse_error( ParserError::MissingWhitespaceAfterDoctypeSystemKeyword, + loc, ); self.set_system_identifier(String::new()); self.state = State::DOCTYPESystemIdentifierDoubleQuoted; @@ -1781,18 +1821,19 @@ impl<'stream> Tokenizer<'stream> { Ch('\'') => { self.parse_error( ParserError::MissingWhitespaceAfterDoctypeSystemKeyword, + loc, ); self.set_system_identifier(String::new()); self.state = State::DOCTYPESystemIdentifierSingleQuoted; } Ch('>') => { - self.parse_error(ParserError::MissingDoctypeSystemIdentifier); + self.parse_error(ParserError::MissingDoctypeSystemIdentifier, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; } - Eof => { - 
self.parse_error(ParserError::EofInDoctype); + StreamEnd => { + self.parse_error(ParserError::EofInDoctype, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; @@ -1800,14 +1841,16 @@ impl<'stream> Tokenizer<'stream> { _ => { self.parse_error( ParserError::MissingQuoteBeforeDoctypeSystemIdentifier, + loc, ); - self.chars.unread(); + self.stream_prev(); self.set_quirks_mode(true); self.state = State::BogusDOCTYPE; } } } State::BeforeDOCTYPESystemIdentifier => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { @@ -1822,13 +1865,13 @@ impl<'stream> Tokenizer<'stream> { self.state = State::DOCTYPESystemIdentifierSingleQuoted; } Ch('>') => { - self.parse_error(ParserError::MissingDoctypeSystemIdentifier); + self.parse_error(ParserError::MissingDoctypeSystemIdentifier, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; } - Eof => { - self.parse_error(ParserError::EofInDoctype); + StreamEnd => { + self.parse_error(ParserError::EofInDoctype, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; @@ -1836,29 +1879,31 @@ impl<'stream> Tokenizer<'stream> { _ => { self.parse_error( ParserError::MissingQuoteBeforeDoctypeSystemIdentifier, + loc, ); - self.chars.unread(); + self.stream_prev(); self.set_quirks_mode(true); self.state = State::BogusDOCTYPE; } } } State::DOCTYPESystemIdentifierDoubleQuoted => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('"') => self.state = State::AfterDOCTYPESystemIdentifier, Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.add_system_identifier(CHAR_REPLACEMENT); } Ch('>') => { - self.parse_error(ParserError::AbruptDoctypeSystemIdentifier); + self.parse_error(ParserError::AbruptDoctypeSystemIdentifier, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; } - Eof => { - self.parse_error(ParserError::EofInDoctype); + StreamEnd => { + self.parse_error(ParserError::EofInDoctype, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; @@ -1867,21 +1912,22 @@ impl<'stream> Tokenizer<'stream> { } } State::DOCTYPESystemIdentifierSingleQuoted => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch('\'') => self.state = State::AfterDOCTYPESystemIdentifier, Ch(CHAR_NUL) => { - self.parse_error(ParserError::UnexpectedNullCharacter); + self.parse_error(ParserError::UnexpectedNullCharacter, loc); self.add_system_identifier(CHAR_REPLACEMENT); } Ch('>') => { - self.parse_error(ParserError::AbruptDoctypeSystemIdentifier); + self.parse_error(ParserError::AbruptDoctypeSystemIdentifier, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; } - Eof => { - self.parse_error(ParserError::EofInDoctype); + StreamEnd => { + self.parse_error(ParserError::EofInDoctype, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; @@ -1890,6 +1936,7 @@ impl<'stream> Tokenizer<'stream> { } } State::AfterDOCTYPESystemIdentifier => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(CHAR_TAB | CHAR_LF | CHAR_FF | CHAR_SPACE) => { @@ -1899,8 +1946,8 @@ impl<'stream> Tokenizer<'stream> { self.emit_current_token(); self.state = State::Data; } - Eof => { - self.parse_error(ParserError::EofInDoctype); + StreamEnd => { + 
self.parse_error(ParserError::EofInDoctype, loc); self.set_quirks_mode(true); self.emit_current_token(); self.state = State::Data; @@ -1908,33 +1955,36 @@ impl<'stream> Tokenizer<'stream> { _ => { self.parse_error( ParserError::UnexpectedCharacterAfterDoctypeSystemIdentifier, + loc, ); - self.chars.unread(); + self.stream_prev(); self.state = State::BogusDOCTYPE; } } } State::BogusDOCTYPE => { + let loc = self.get_location(); let c = self.read_char(); match c { - Ch('>') | Eof => { + Ch('>') | StreamEnd => { self.emit_current_token(); self.state = State::Data; } - Ch(CHAR_NUL) => self.parse_error(ParserError::UnexpectedNullCharacter), + Ch(CHAR_NUL) => self.parse_error(ParserError::UnexpectedNullCharacter, loc), _ => { // ignore } } } State::CDATASection => { + let loc = self.get_location(); let c = self.read_char(); match c { Ch(']') => { self.state = State::CDATASectionBracket; } - Eof => { - self.parse_error(ParserError::EofInCdata); + StreamEnd => { + self.parse_error(ParserError::EofInCdata, loc); self.state = State::Data; } _ => self.consume(c.into()), @@ -1946,19 +1996,24 @@ impl<'stream> Tokenizer<'stream> { self.state = State::CDATASectionEnd; } else { self.consume(']'); - self.chars.unread(); + self.stream_prev(); self.state = State::CDATASection; } } State::CDATASectionEnd => { let c = self.read_char(); match c { - Ch(']') => self.consume(']'), Ch('>') => self.state = State::Data, + Ch(']') => self.consume(']'), + StreamEnd => { + self.consume(']'); + self.consume(']'); + self.state = State::CDATASection; + } _ => { self.consume(']'); self.consume(']'); - self.chars.unread(); + self.consume(c.into()); self.state = State::CDATASection; } } @@ -1972,22 +2027,26 @@ impl<'stream> Tokenizer<'stream> { /// This macro reads a character from the input stream and optionally generates (tokenization) /// errors if the character is not valid. - fn read_char(&mut self) -> Bytes { - let mut c = self.chars.read_char(); + fn read_char(&mut self) -> Character { + let loc = self.get_location(); + let mut c = self.stream_read_and_next(); + match c { - Bytes::Surrogate(..) => { - self.parse_error(ParserError::SurrogateInInputStream); + Character::Surrogate(..) => { + self.parse_error(ParserError::SurrogateInInputStream, loc); c = Ch(CHAR_REPLACEMENT); } Ch(c) if self.is_control_char(c as u32) => { - self.parse_error(ParserError::ControlCharacterInInputStream); + self.parse_error(ParserError::ControlCharacterInInputStream, loc); } Ch(c) if self.is_noncharacter(c as u32) => { - self.parse_error(ParserError::NoncharacterInInputStream); + self.parse_error(ParserError::NoncharacterInInputStream, loc); } _ => {} } + // println!("stream_read(): {:?}", c); + c } @@ -2088,7 +2147,7 @@ impl<'stream> Tokenizer<'stream> { self.consumed.push_str(" Tokenizer<'stream> { } /// Creates a parser log error message - pub(crate) fn parse_error(&mut self, message: ParserError) { - // The previous position is where the error occurred - let pos = self.chars.get_previous_position(); - + pub(crate) fn parse_error(&mut self, message: ParserError, loc: Location) { self.error_logger .borrow_mut() - .add_error(pos, message.as_str()); + .add_error(loc, message.as_str()); } /// Set is_closing_tag in current token fn set_is_closing_in_current_token(&mut self, is_closing: bool) { match &mut self.current_token.as_mut().unwrap() { Token::EndTag { .. 
} => { - self.parse_error(ParserError::EndTagWithTrailingSolidus); + self.stream_prev(); + self.parse_error(ParserError::EndTagWithTrailingSolidus, self.get_location()); + self.stream_next_n(1); } Token::StartTag { is_self_closing, .. @@ -2211,7 +2269,10 @@ impl<'stream> Tokenizer<'stream> { match self.current_token.as_mut().expect("current token") { Token::EndTag { .. } => { - self.parse_error(ParserError::EndTagWithAttributes); + // Error is one char before this one. Unread, fetch location and read again + self.stream_prev(); + self.parse_error(ParserError::EndTagWithAttributes, self.get_location()); + self.stream_next_n(1); } Token::StartTag { attributes, .. } => { for (key, value) in &self.current_attrs { @@ -2222,4 +2283,26 @@ impl<'stream> Tokenizer<'stream> { _ => {} } } + + fn stream_read_and_next(&mut self) -> Character { + let c = self.stream.read_and_next(); + self.last_char = c; + self.location_handler.inc(c); + c + } + + fn stream_prev(&mut self) { + if self.last_char == StreamEnd { + return; + } + + self.location_handler.dec(); + self.stream.prev(); + } + + fn stream_next_n(&mut self, n: usize) { + for _ in 0..n { + self.stream_read_and_next(); + } + } } diff --git a/crates/gosub_html5/src/tokenizer/character_reference.rs b/crates/gosub_html5/src/tokenizer/character_reference.rs index 088995c3b..0fe2b1b55 100644 --- a/crates/gosub_html5/src/tokenizer/character_reference.rs +++ b/crates/gosub_html5/src/tokenizer/character_reference.rs @@ -3,7 +3,8 @@ extern crate lazy_static; use crate::error_logger::ParserError; use crate::tokenizer::replacement_tables::{TOKEN_NAMED_CHARS, TOKEN_REPLACEMENTS}; use crate::tokenizer::{Tokenizer, CHAR_REPLACEMENT}; -use gosub_shared::bytes::Bytes::{self, Ch}; +use gosub_shared::byte_stream::Character::Ch; +use gosub_shared::byte_stream::{Character, Stream}; use lazy_static::lazy_static; /// Different states for the character references @@ -26,7 +27,7 @@ impl Tokenizer<'_> { /// @TODO: fix additional allowed char pub fn consume_character_reference( &mut self, - _additional_allowed_char: Option, + _additional_allowed_char: Option, as_attribute: bool, ) { let mut ccr_state = CcrState::CharacterReference; @@ -38,28 +39,31 @@ impl Tokenizer<'_> { self.temporary_buffer.clear(); self.temporary_buffer.push('&'); - let c = self.read_char(); + let c = self.stream_read_and_next(); match c { Ch(ch) if ch.is_ascii_alphanumeric() => { - self.chars.unread(); + self.stream_prev(); ccr_state = CcrState::NamedCharacterReference; } Ch(c @ '#') => { self.temporary_buffer.push(c); ccr_state = CcrState::NumericCharacterReference; } + Character::StreamEnd => { + self.consume_temp_buffer(as_attribute); + return; + } _ => { self.consume_temp_buffer(as_attribute); - - self.chars.unread(); + self.stream_prev(); return; } } } CcrState::NamedCharacterReference => { if let Some(entity) = self.find_entity() { - self.chars.skip(entity.len()); - let c = self.chars.look_ahead(0); + self.stream_next_n(entity.len()); + let c = self.stream.look_ahead(0); if as_attribute && !entity.ends_with(';') @@ -88,10 +92,10 @@ impl Tokenizer<'_> { self.temporary_buffer.clear(); if !entity.ends_with(';') { - // We need to return the position where we expected the ';' - self.chars.read_char(); // We can't use skip, as this might interfere with EOF stuff (fix it) - self.parse_error(ParserError::MissingSemicolonAfterCharacterReference); - self.chars.unread(); + self.parse_error( + ParserError::MissingSemicolonAfterCharacterReference, + self.get_location(), + ); } return; @@ -101,7 +105,7 @@ impl 
Tokenizer<'_> { ccr_state = CcrState::AmbiguousAmpersand; } CcrState::AmbiguousAmpersand => { - let c = self.read_char(); + let c = self.stream_read_and_next(); match c { // Element::Eof => return, Ch(ch) if ch.is_ascii_alphanumeric() => { @@ -112,12 +116,19 @@ impl Tokenizer<'_> { } } Ch(';') => { - self.parse_error(ParserError::UnknownNamedCharacterReference); - self.chars.unread(); + self.stream_prev(); + self.parse_error( + ParserError::UnknownNamedCharacterReference, + self.get_location(), + ); + return; + } + Character::StreamEnd => { + // self.consume_temp_buffer(as_attribute); return; } _ => { - self.chars.unread(); + self.stream_prev(); return; } } @@ -125,45 +136,82 @@ impl Tokenizer<'_> { CcrState::NumericCharacterReference => { char_ref_code = Some(0); - let c = self.read_char(); - if let Ch('X' | 'x') = c { - // Element::Eof => ccr_state = CcrState::NumericalCharacterReferenceEnd, - self.temporary_buffer.push(c.into()); - ccr_state = CcrState::HexadecimalCharacterReferenceStart; - } else { - self.chars.unread(); - ccr_state = CcrState::DecimalCharacterReferenceStart; + let c = self.stream_read_and_next(); + match c { + Ch('X' | 'x') => { + // Element::Eof => ccr_state = CcrState::NumericalCharacterReferenceEnd, + self.temporary_buffer.push(c.into()); + ccr_state = CcrState::HexadecimalCharacterReferenceStart; + } + Character::StreamEnd => { + ccr_state = CcrState::DecimalCharacterReferenceStart; + } + _ => { + self.stream_prev(); + ccr_state = CcrState::DecimalCharacterReferenceStart; + } } } CcrState::HexadecimalCharacterReferenceStart => { - let c = self.read_char(); - if let Ch('0'..='9' | 'A'..='F' | 'a'..='f') = c { - // Element::Eof => ccr_state = CcrState::NumericalCharacterReferenceEnd, - self.chars.unread(); - ccr_state = CcrState::HexadecimalCharacterReference; - } else { - self.parse_error(ParserError::AbsenceOfDigitsInNumericCharacterReference); - self.consume_temp_buffer(as_attribute); + let loc = self.get_location(); + let c = self.stream_read_and_next(); + match c { + Ch('0'..='9' | 'A'..='F' | 'a'..='f') => { + // Element::Eof => ccr_state = CcrState::NumericalCharacterReferenceEnd, + self.stream_prev(); + ccr_state = CcrState::HexadecimalCharacterReference; + } + Character::StreamEnd => { + self.parse_error( + ParserError::AbsenceOfDigitsInNumericCharacterReference, + loc, + ); + self.consume_temp_buffer(as_attribute); + return; + } + _ => { + self.parse_error( + ParserError::AbsenceOfDigitsInNumericCharacterReference, + loc, + ); + self.consume_temp_buffer(as_attribute); - self.chars.unread(); - return; + self.stream_prev(); + return; + } } } CcrState::DecimalCharacterReferenceStart => { - let c = self.read_char(); - if let Ch('0'..='9') = c { - self.chars.unread(); - ccr_state = CcrState::DecimalCharacterReference; - } else { - self.parse_error(ParserError::AbsenceOfDigitsInNumericCharacterReference); - self.consume_temp_buffer(as_attribute); - - self.chars.unread(); - return; + let loc = self.get_location(); + let c = self.stream_read_and_next(); + match c { + Ch('0'..='9') => { + self.stream_prev(); + ccr_state = CcrState::DecimalCharacterReference; + } + Character::StreamEnd => { + self.parse_error( + ParserError::AbsenceOfDigitsInNumericCharacterReference, + loc, + ); + self.consume_temp_buffer(as_attribute); + return; + } + _ => { + self.parse_error( + ParserError::AbsenceOfDigitsInNumericCharacterReference, + loc, + ); + self.consume_temp_buffer(as_attribute); + + self.stream_prev(); + return; + } } } CcrState::HexadecimalCharacterReference => { - let 
c = self.read_char(); + let loc = self.get_location(); + let c = self.stream_read_and_next(); match c { // Element::Eof => ccr_state = CcrState::NumericalCharacterReferenceEnd, Ch(c @ '0'..='9') => { @@ -193,15 +241,26 @@ impl Tokenizer<'_> { Ch(';') => { ccr_state = CcrState::NumericalCharacterReferenceEnd; } + Character::StreamEnd => { + self.parse_error( + ParserError::MissingSemicolonAfterCharacterReference, + loc, + ); + ccr_state = CcrState::NumericalCharacterReferenceEnd; + } _ => { - self.parse_error(ParserError::MissingSemicolonAfterCharacterReference); - self.chars.unread(); + self.parse_error( + ParserError::MissingSemicolonAfterCharacterReference, + loc, + ); + self.stream_prev(); ccr_state = CcrState::NumericalCharacterReferenceEnd; } } } CcrState::DecimalCharacterReference => { - let c = self.read_char(); + let loc = self.get_location(); + let c = self.stream_read_and_next(); match c { // Element::Eof => ccr_state = CcrState::NumericalCharacterReferenceEndState, Ch(c @ '0'..='9') => { @@ -215,10 +274,20 @@ impl Tokenizer<'_> { Ch(';') => { ccr_state = CcrState::NumericalCharacterReferenceEnd; } + Character::StreamEnd => { + self.parse_error( + ParserError::MissingSemicolonAfterCharacterReference, + loc, + ); + ccr_state = CcrState::NumericalCharacterReferenceEnd; + } _ => { - self.parse_error(ParserError::MissingSemicolonAfterCharacterReference); - self.chars.unread(); + self.parse_error( + ParserError::MissingSemicolonAfterCharacterReference, + loc, + ); ccr_state = CcrState::NumericalCharacterReferenceEnd; + self.stream_prev(); } } } @@ -227,35 +296,37 @@ impl Tokenizer<'_> { let mut char_ref_code = char_ref_code.unwrap_or(0); if char_ref_code == 0 && !overflow { - self.chars.read_char(); - self.parse_error(ParserError::NullCharacterReference); - self.chars.unread(); + self.parse_error(ParserError::NullCharacterReference, self.get_location()); char_ref_code = CHAR_REPLACEMENT as u32; } if char_ref_code > 0x10FFFF || overflow { - self.chars.read_char(); - self.parse_error(ParserError::CharacterReferenceOutsideUnicodeRange); - self.chars.unread(); + self.parse_error( + ParserError::CharacterReferenceOutsideUnicodeRange, + self.get_location(), + ); char_ref_code = CHAR_REPLACEMENT as u32; } if self.is_surrogate(char_ref_code) { - self.chars.read_char(); - self.parse_error(ParserError::SurrogateCharacterReference); - self.chars.unread(); + self.parse_error( + ParserError::SurrogateCharacterReference, + self.get_location(), + ); char_ref_code = CHAR_REPLACEMENT as u32; } if self.is_noncharacter(char_ref_code) { - self.chars.read_char(); - self.parse_error(ParserError::NoncharacterCharacterReference); - self.chars.unread(); + self.parse_error( + ParserError::NoncharacterCharacterReference, + self.get_location(), + ); // char_ref_code = CHAR_REPLACEMENT as u32; } if self.is_control_char(char_ref_code) || char_ref_code == 0x0D { - self.chars.read_char(); - self.parse_error(ParserError::ControlCharacterReference); - self.chars.unread(); + self.parse_error( + ParserError::ControlCharacterReference, + self.get_location(), + ); if TOKEN_REPLACEMENTS.contains_key(&char_ref_code) { char_ref_code = *TOKEN_REPLACEMENTS.get(&char_ref_code).unwrap() as u32; @@ -309,17 +380,17 @@ impl Tokenizer<'_> { /// Finds the longest entity from the current position in the stream. Returns the entity /// replacement OR None when no entity has been found. 
fn find_entity(&mut self) -> Option { - let s = self.chars.look_ahead_slice(*LONGEST_ENTITY_LENGTH); - let chars: Vec = s.chars().collect(); + let chars = self.stream.get_slice(*LONGEST_ENTITY_LENGTH); - for i in (0..=s.len()).rev() { + for i in (0..=chars.len()).rev() { if let Some(slice) = chars.get(0..i) { - let entity: String = slice.iter().collect(); + let entity: String = slice.iter().map(|c| c.to_string()).collect(); if TOKEN_NAMED_CHARS.contains_key(entity.as_str()) { return Some(entity); } } } + None } } @@ -335,7 +406,7 @@ lazy_static! { mod tests { use crate::error_logger::ErrorLogger; use crate::tokenizer::{ParserData, Tokenizer}; - use gosub_shared::bytes::CharIterator; + use gosub_shared::byte_stream::ByteStream; use std::cell::RefCell; use std::rc::Rc; @@ -346,11 +417,12 @@ mod tests { fn $name() { let (input, expected) = $value; - let mut chars = CharIterator::new(); - chars.read_from_str(input, None); + let mut stream = ByteStream::new(); + stream.read_from_str(input, None); + stream.close(); let error_logger = Rc::new(RefCell::new(ErrorLogger::new())); - let mut tokenizer = Tokenizer::new(&mut chars, None, error_logger.clone()); + let mut tokenizer = Tokenizer::new(&mut stream, None, error_logger.clone()); let token = tokenizer.next_token(ParserData::default()).unwrap(); assert_eq!(expected, token.to_string()); @@ -398,12 +470,11 @@ mod tests { entity_111: ("©a", "©a") entity_112: ("©a;", "©a;") entity_113: ("©", "©") - // entity_114: ("©&", "©&") + entity_114: ("©&", "©&") entity_115: ("©a ", "©a ") entity_116: ("©X ", "©X ") - - // ChatGPT generated tests + // // ChatGPT generated tests entity_200: ("©", "©") entity_201: ("© ", "© ") entity_202: ("©", "©") diff --git a/crates/gosub_html5/src/tokenizer/tests.rs b/crates/gosub_html5/src/tokenizer/tests.rs index f47afb336..e5f9cf56f 100644 --- a/crates/gosub_html5/src/tokenizer/tests.rs +++ b/crates/gosub_html5/src/tokenizer/tests.rs @@ -63,7 +63,6 @@ fn tokenization(filename: &str) { test.tokenize(); continue; } - test.assert_valid(); } } diff --git a/crates/gosub_renderer/src/render_tree.rs b/crates/gosub_renderer/src/render_tree.rs index 84b75f90f..1e4022118 100644 --- a/crates/gosub_renderer/src/render_tree.rs +++ b/crates/gosub_renderer/src/render_tree.rs @@ -10,7 +10,7 @@ use gosub_html5::parser::Html5Parser; use gosub_net::http::ureq; use gosub_render_backend::{RenderBackend, SizeU32}; use gosub_rendering::position::PositionTree; -use gosub_shared::bytes::{CharIterator, Confidence, Encoding}; +use gosub_shared::byte_stream::{ByteStream, Confidence, Encoding}; use gosub_styling::css_values::CssProperties; use gosub_styling::render_tree::{generate_render_tree, RenderNodeData, RenderTree as StyleTree}; @@ -84,12 +84,14 @@ pub(crate) fn load_html_rendertree( bail!("Unsupported url scheme: {}", url.scheme()); }; - let mut chars = CharIterator::new(); - chars.read_from_str(&html, Some(Encoding::UTF8)); - chars.set_confidence(Confidence::Certain); + let mut stream = ByteStream::new(); + stream.read_from_str(&html, Some(Encoding::UTF8)); + stream.set_confidence(Confidence::Certain); + stream.close(); let mut doc_handle = DocumentBuilder::new_document(Some(url)); - let parse_errors = Html5Parser::parse_document(&mut chars, Document::clone(&doc_handle), None)?; + let parse_errors = + Html5Parser::parse_document(&mut stream, Document::clone(&doc_handle), None)?; for error in parse_errors { eprintln!("Parse error: {:?}", error); diff --git a/crates/gosub_shared/src/byte_stream.rs b/crates/gosub_shared/src/byte_stream.rs index 
b7ef0ac62..50d70cee4 100644 --- a/crates/gosub_shared/src/byte_stream.rs +++ b/crates/gosub_shared/src/byte_stream.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; +use std::fmt::{Debug, Formatter}; use std::io::Read; use std::{fmt, io}; @@ -42,6 +44,16 @@ use Character::*; /// Converts the given character to a char. This is only valid for UTF8 characters. Surrogate /// and EOF characters are converted to 0x0000 +impl From<&Character> for char { + fn from(c: &Character) -> Self { + match c { + Ch(c) => *c, + Surrogate(..) => 0x0000 as char, + StreamEmpty | StreamEnd => 0x0000 as char, + } + } +} + impl From for char { fn from(c: Character) -> Self { match c { @@ -64,11 +76,19 @@ impl fmt::Display for Character { } impl Character { + /// Returns true when the character is a whitespace pub fn is_whitespace(&self) -> bool { - matches!(self, Self::Ch(c) if c.is_whitespace()) + matches!(self, Ch(c) if c.is_whitespace()) } + + /// Returns true when the character is a numerical pub fn is_numeric(&self) -> bool { - matches!(self, Self::Ch(c) if c.is_numeric()) + matches!(self, Ch(c) if c.is_numeric()) + } + + /// Converts a slice of characters into a string + pub fn slice_to_string(v: &[Character]) -> String { + v.iter().map(char::from).collect() } } @@ -103,8 +123,10 @@ pub trait Stream { fn prev(&mut self); /// Unread n characters fn prev_n(&mut self, n: usize); + // Seek to a specific position + fn seek(&mut self, pos: usize); // Returns a slice - fn get_slice(&self, start: usize, end: usize) -> &[Character]; + fn get_slice(&self, len: usize) -> &[Character]; /// Resets the stream back to the start position fn reset_stream(&mut self); /// Closes the stream (no more data can be added) @@ -152,6 +174,16 @@ impl Stream for ByteStream { c } + /// Seeks to a specific position in the stream + fn seek(&mut self, pos: usize) { + if pos >= self.buffer.len() { + self.buffer_pos = self.buffer.len(); + return; + } + + self.buffer_pos = pos; + } + /// Looks ahead in the stream, can use an optional index if we want to seek further /// (or back) in the stream. fn look_ahead(&self, offset: usize) -> Character { @@ -203,8 +235,12 @@ impl Stream for ByteStream { } /// Retrieves a slice of the buffer - fn get_slice(&self, start: usize, end: usize) -> &[Character] { - &self.buffer[start..end] + fn get_slice(&self, len: usize) -> &[Character] { + if self.buffer_pos + len > self.buffer.len() { + return &self.buffer[self.buffer_pos..]; + } + + &self.buffer[self.buffer_pos..self.buffer_pos + len] } /// Resets the stream to the first character of the stream @@ -275,6 +311,7 @@ impl ByteStream { self.close(); self.force_set_encoding(e.unwrap_or(Encoding::UTF8)); self.reset_stream(); + self.close(); Ok(()) } @@ -291,6 +328,10 @@ impl ByteStream { self.force_set_encoding(e.unwrap_or(Encoding::UTF8)); } + pub fn close(&mut self) { + self.closed = true; + } + /// Normalizes newlines (CRLF/CR => LF) and converts high ascii to '?' 
 fn normalize_newlines_and_ascii(&self, buffer: &[u8]) -> Vec<Character> {
     let mut result = Vec::with_capacity(buffer.len());
@@ -404,6 +445,95 @@ impl ByteStream {
     }
 }
+/// Location holds the start position of the given element in the data source
+#[derive(Clone, PartialEq)]
+pub struct Location {
+    /// Line number, starting with 1
+    pub line: usize,
+    /// Column number, starting with 1
+    pub column: usize,
+    /// Byte offset, starting with 0
+    pub offset: usize,
+}
+
+impl Default for Location {
+    /// Default to line 1, column 1
+    fn default() -> Self {
+        Self::new(1, 1, 0)
+    }
+}
+
+impl Location {
+    /// Create a new Location
+    pub fn new(line: usize, column: usize, offset: usize) -> Self {
+        Self {
+            line,
+            column,
+            offset,
+        }
+    }
+}
+
+impl Debug for Location {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "({}:{})", self.line, self.column)
+    }
+}
+
+/// LocationHandler is a wrapper that will deal with line/column locations in the stream
+pub struct LocationHandler {
+    pub start_location: Location,
+    pub cur_location: Location,
+    pub line_columns: HashMap<usize, usize>,
+}
+
+impl LocationHandler {
+    /// Create a new LocationHandler. start_location can be set in case the stream is
+    /// not starting at 1:1
+    pub fn new(start_location: Location) -> Self {
+        Self {
+            start_location,
+            cur_location: Location::default(),
+            line_columns: HashMap::new(),
+        }
+    }
+
+    pub fn inc(&mut self, ch: Character) {
+        match ch {
+            Ch(CHAR_LF) => {
+                self.line_columns
+                    .insert(self.cur_location.line, self.cur_location.column);
+
+                self.cur_location.line += 1;
+                self.cur_location.column = 1;
+                self.cur_location.offset += 1;
+            }
+            Ch(_) => {
+                self.cur_location.column += 1;
+                self.cur_location.offset += 1;
+            }
+            StreamEnd | StreamEmpty => {}
+            _ => {}
+        }
+    }
+
+    pub fn dec(&mut self) {
+        if self.cur_location.offset == 0 {
+            return;
+        }
+
+        if self.cur_location.column == 1 {
+            self.cur_location.line -= 1;
+            self.cur_location.column =
+                *self.line_columns.get(&self.cur_location.line).unwrap_or(&1);
+        } else {
+            self.cur_location.column -= 1;
+        }
+
+        self.cur_location.offset -= 1;
+    }
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
@@ -565,6 +695,7 @@ mod test {
         assert_eq!(stream.read_and_next(), Ch('h'));
         assert_eq!(stream.read_and_next(), Ch('i'));
         assert!(matches!(stream.read_and_next(), StreamEnd));
+        assert!(matches!(stream.read_and_next(), StreamEnd));
     }
 #[test]
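`Location` and `LocationHandler` replace the CSS crate's private location module (deleted below). A small sketch of how the pair behaves, with the expected values read off the `inc`/`dec` implementation above:

```rust
use gosub_shared::byte_stream::{Character, Location, LocationHandler};

fn main() {
    // Track positions from the default 1:1 start; a different start_location
    // can be passed when the stream does not begin at the top of the source.
    let mut handler = LocationHandler::new(Location::default());

    for ch in ['a', '\n', 'b'] {
        handler.inc(Character::Ch(ch));
    }
    // After "a\nb" we are at line 2, column 2, byte offset 3.
    assert_eq!(handler.cur_location, Location::new(2, 2, 3));

    // dec() walks back across the newline using the column remembered
    // for line 1 in line_columns.
    handler.dec();
    handler.dec();
    assert_eq!(handler.cur_location, Location::new(1, 2, 1));
}
```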
diff --git a/crates/gosub_shared/src/bytes.rs b/crates/gosub_shared/src/bytes.rs
deleted file mode 100644
index 0bbee5dc6..000000000
--- a/crates/gosub_shared/src/bytes.rs
+++ /dev/null
@@ -1,587 +0,0 @@
-use std::collections::HashMap;
-use std::io::Read;
-use std::iter::Iterator;
-use std::{fmt, io};
-
-pub const CHAR_LF: char = '\u{000A}';
-pub const CHAR_CR: char = '\u{000D}';
-
-/// Encoding defines the way the buffer stream is read, as what defines a "character".
-#[derive(PartialEq)]
-pub enum Encoding {
-    /// Stream is of UTF8 characters
-    UTF8,
-    /// Stream consists of 8-bit ASCII characters
-    ASCII,
-}
-
-/// The confidence decides how confident we are that the input stream is of this encoding
-#[derive(PartialEq)]
-pub enum Confidence {
-    /// This encoding might be the one we need
-    Tentative(f32),
-    /// We are certain to use this encoding
-    Certain,
-}
-
-/// This struct defines a position in the stream. POsition itself is 0-based, but line and col are
-/// 1-based and are calculated from the line_offsets vector.
-#[derive(Clone, Copy, Debug, PartialEq)]
-pub struct Position {
-    /// Offset in the stream
-    pub offset: usize,
-    /// Line number (1-based)
-    pub line: usize,
-    /// Column number (1-based)
-    pub col: usize,
-}
-
-impl Position {
-    /// Create a new position
-    #[must_use]
-    pub fn new(offset: usize, line: usize, col: usize) -> Self {
-        Self { offset, line, col }
-    }
-}
-
-impl fmt::Display for Position {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "{}:{}:{}", self.offset, self.line, self.col)
-    }
-}
-
-/// Defines a single character/element in the stream. This is either a UTF8 character, or
-/// a surrogate characters since these cannot be stored in a single char.
-/// Eof is denoted as a separate element.
-#[derive(Clone, Copy, Debug, PartialEq)]
-pub enum Bytes {
-    /// Standard UTF character
-    Ch(char),
-    /// Surrogate character (since they cannot be stored in char)
-    Surrogate(u16),
-    /// End of stream
-    Eof,
-}
-
-use Bytes::*;
-
-/// Converts the given character to a char. This is only valid for UTF8 characters. Surrogate
-/// and EOF characters are converted to 0x0000
-impl From<Bytes> for char {
-    fn from(c: Bytes) -> Self {
-        match c {
-            Ch(c) => c,
-            Bytes::Surrogate(..) | Eof => 0x0000 as char,
-        }
-    }
-}
-
-impl fmt::Display for Bytes {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match self {
-            Ch(ch) => write!(f, "{ch}"),
-            Bytes::Surrogate(surrogate) => write!(f, "U+{surrogate:04X}"),
-            Eof => write!(f, "EOF"),
-        }
-    }
-}
-
-impl Bytes {
-    pub fn is_whitespace(&self) -> bool {
-        matches!(self, Self::Ch(c) if c.is_whitespace())
-    }
-
-    pub fn is_numeric(&self) -> bool {
-        matches!(self, Self::Ch(c) if c.is_numeric())
-    }
-}
-
-/// Buffered UTF-8 iterator
-pub struct CharIterator {
-    /// Current encoding
-    pub encoding: Encoding,
-    /// How confident are we that this is the correct encoding?
-    pub confidence: Confidence,
-    /// Current positions
-    pub position: Position,
-    /// Length (in chars) of the buffer
-    pub length: usize,
-    /// Offsets of the given lines
-    line_columns: HashMap<usize, usize>,
-    /// Reference to the actual buffer stream in characters
-    buffer: Vec<Bytes>,
-    /// Reference to the actual buffer stream in u8 bytes
-    u8_buffer: Vec<u8>,
-    /// If all things are ok, both buffer and u8_buffer should refer to the same memory location (?)
-    pub has_read_eof: bool, // True when we just read an EOF
-}
-
-impl Default for CharIterator {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl Iterator for CharIterator {
-    type Item = char;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.eof() || self.position.offset >= self.length {
-            return None;
-        }
-
-        // SAFETY: self.buffer and self.u8_buffer have the same length
-        let c = self.u8_buffer[self.position.offset] as char;
-
-        if c == '\n' {
-            // Store line offset for the given line
-            self.line_columns
-                .insert(self.position.line, self.position.col);
-            // And continue position on the next line
-            self.position.line += 1;
-            self.position.col = 1;
-        } else {
-            self.position.col += 1;
-        }
-
-        self.position.offset += 1;
-        Some(c)
-    }
-}
-
-impl CharIterator {
-    /// Create a new default empty input stream
-    #[must_use]
-    pub fn new() -> Self {
-        Self {
-            encoding: Encoding::UTF8,
-            confidence: Confidence::Tentative(0.0),
-            position: Position {
-                offset: 0,
-                line: 1,
-                col: 1,
-            },
-            length: 0,
-            line_columns: HashMap::new(),
-            buffer: Vec::new(),
-            u8_buffer: Vec::new(),
-            has_read_eof: false,
-        }
-    }
-    /// Returns true when the encoding encountered is defined as certain
-    pub fn is_certain_encoding(&self) -> bool {
-        self.confidence == Confidence::Certain
-    }
-
-    /// Detect the given encoding from stream analysis
-    pub fn detect_encoding(&mut self) {
-        let encoding = chardet::detect(&self.u8_buffer);
-        match encoding.0.as_str() {
-            "ascii" => self.encoding = Encoding::ASCII,
-            "utf-8" => self.encoding = Encoding::UTF8,
-            _ => {}
-        };
-        match encoding.1 {
-            p if p >= 0.99 => self.confidence = Confidence::Certain,
-            p => self.confidence = Confidence::Tentative(p),
-        };
-    }
-
-    /// Returns true when the stream pointer is at the end of the stream
-    pub fn eof(&self) -> bool {
-        self.has_read_eof || self.position.offset >= self.length
-    }
-
-    /// Reset the stream reader back to the start
-    pub fn reset(&mut self) {
-        self.position.offset = 0;
-        self.position.line = 1;
-        self.position.col = 1;
-    }
-
-    /// Skip offset characters in the stream (based on chars)
-    pub fn skip(&mut self, offset: usize) {
-        let mut skip_len = offset;
-        if self.position.offset + offset >= self.length {
-            skip_len = self.length - self.position.offset;
-        }
-
-        for _ in 0..skip_len {
-            self.read_char();
-        }
-    }
-
-    /// Returns the previous position based on the current position
-    pub fn get_previous_position(&mut self) -> Position {
-        // if we are at the beginning or the end of the stream, we just return the current position
-        if self.position.offset == 0 || self.has_read_eof {
-            return self.position;
-        }
-
-        self.unread();
-        let pos = self.position;
-        self.skip(1);
-
-        pos
-    }
-
-    /// Returns the current offset in the stream
-    pub fn tell(&self) -> usize {
-        self.position.offset
-    }
-
-    /// Set the given confidence of the input stream encoding
-    pub fn set_confidence(&mut self, c: Confidence) {
-        self.confidence = c;
-    }
-
-    /// Changes the encoding and if necessary, decodes the u8 buffer into the correct encoding
-    pub fn set_encoding(&mut self, e: Encoding) {
-        // Don't convert if the encoding is the same as it already is
-        if self.encoding == e {
-            return;
-        }
-
-        self.force_set_encoding(e);
-    }
-
-    /// Sets the encoding for this stream, and decodes the u8_buffer into the buffer with the
-    /// correct encoding.
-    pub fn force_set_encoding(&mut self, e: Encoding) {
-        match e {
-            Encoding::UTF8 => {
-                let str_buf = unsafe {
-                    std::str::from_utf8_unchecked(&self.u8_buffer)
-                        .replace("\u{000D}\u{000A}", "\u{000A}")
-                        .replace('\u{000D}', "\u{000A}")
-                };
-
-                // Convert the utf8 string into characters so we can use easy indexing
-                self.buffer = str_buf
-                    .chars()
-                    .map(|c| {
-                        // // Check if we have a non-bmp character. This means it's above 0x10000
-                        // let cp = c as u32;
-                        // if cp > 0x10000 && cp <= 0x10FFFF {
-                        //     let adjusted = cp - 0x10000;
-                        //     let lead = ((adjusted >> 10) & 0x3FF) as u16 + 0xD800;
-                        //     let trail = (adjusted & 0x3FF) as u16 + 0xDC00;
-                        //     self.buffer.push(Element::Surrogate(lead));
-                        //     self.buffer.push(Element::Surrogate(trail));
-                        //     continue;
-                        // }
-
-                        if (0xD800..=0xDFFF).contains(&(c as u32)) {
-                            Bytes::Surrogate(c as u16)
-                        } else {
-                            Ch(c)
-                        }
-                    })
-                    .collect::<Vec<Bytes>>();
-                self.length = self.buffer.len();
-            }
-            Encoding::ASCII => {
-                // Convert the string into characters so we can use easy indexing. Any non-ascii chars (> 0x7F) are converted to '?'
-                self.buffer = self.normalize_newlines_and_ascii(&self.u8_buffer);
-                self.length = self.buffer.len();
-            }
-        }
-
-        self.encoding = e;
-    }
-
-    /// Normalizes newlines (CRLF/CR => LF) and converts high ascii to '?'
-    fn normalize_newlines_and_ascii(&self, buffer: &[u8]) -> Vec<Bytes> {
-        let mut result = Vec::with_capacity(buffer.len());
-
-        for i in 0..buffer.len() {
-            if buffer[i] == CHAR_CR as u8 {
-                // convert CR to LF, or CRLF to LF
-                if i + 1 < buffer.len() && buffer[i + 1] == CHAR_LF as u8 {
-                    continue;
-                }
-                result.push(Ch(CHAR_LF));
-            } else if buffer[i] >= 0x80 {
-                // Convert high ascii to ?
-                result.push(Ch('?'));
-            } else {
-                // everything else is ok
-                result.push(Ch(buffer[i] as char));
-            }
-        }
-
-        result
-    }
-
-    /// Read directly from bytes
-    pub fn read_from_bytes(&mut self, bytes: &[u8], e: Option<Encoding>) -> io::Result<()> {
-        self.u8_buffer = bytes.to_vec();
-        self.force_set_encoding(e.unwrap_or(Encoding::UTF8));
-        self.reset();
-        Ok(())
-    }
-
-    /// Populates the current buffer with the contents of given file f
-    pub fn read_from_file(&mut self, mut f: impl Read, e: Option<Encoding>) -> io::Result<()> {
-        // First we read the u8 bytes into a buffer
-        f.read_to_end(&mut self.u8_buffer).expect("uh oh");
-        self.force_set_encoding(e.unwrap_or(Encoding::UTF8));
-        self.reset();
-        Ok(())
-    }
-
-    /// Populates the current buffer with the contents of the given string s
-    pub fn read_from_str(&mut self, s: &str, e: Option<Encoding>) {
-        self.u8_buffer = Vec::from(s.as_bytes());
-        self.force_set_encoding(e.unwrap_or(Encoding::UTF8));
-        self.reset();
-    }
-
-    /// Returns the number of characters left in the buffer
-    #[cfg(test)]
-    fn chars_left(&self) -> usize {
-        self.length - self.position.offset
-    }
-
-    ///
-    /// Reads a character and increases the current pointer, or read EOF as None
-    pub fn read_char(&mut self) -> Bytes {
-        // Return none if we already have read EOF
-        if self.has_read_eof {
-            return Eof;
-        }
-
-        // If we still can move forward in the stream, move forwards
-        if self.position.offset < self.length {
-            let c = self.buffer[self.position.offset];
-            if c == Ch('\n') {
-                // Store line offset for the given line
-                self.line_columns
-                    .insert(self.position.line, self.position.col);
-                // And continue position on the next line
-                self.position.line += 1;
-                self.position.col = 1;
-            } else {
-                self.position.col += 1;
-            }
-            self.position.offset += 1;
-            return c;
-        }
-
-        // otherwise, we have reached the end of the stream
-        self.has_read_eof = true;
-
-        Eof
-    }
-
-    pub fn unread(&mut self) {
-        // We already read eof, so "unread" the eof by unsetting the flag
-        if self.has_read_eof {
-            self.has_read_eof = false;
-            return;
-        }
-
-        // If we can track back from the offset, we can do so
-        if self.position.offset > 0 {
-            self.position.offset -= 1;
-
-            if self.position.col == 1 {
-                self.position.line -= 1;
-                let key = self.position.line;
-                self.position.col = *self.line_columns.get(&key).unwrap_or(&1);
-            } else {
-                self.position.col -= 1;
-            }
-        }
-    }
-
-    /// Looks ahead in the stream and returns len characters
-    pub fn look_ahead_slice(&self, len: usize) -> String {
-        let end_pos = std::cmp::min(self.length, self.position.offset + len);
-
-        let slice = &self.buffer[self.position.offset..end_pos];
-        slice.iter().map(ToString::to_string).collect()
-    }
-
-    /// Looks ahead in the stream, can use an optional index if we want to seek further
-    /// (or back) in the stream.
-    pub fn look_ahead(&self, offset: usize) -> Bytes {
-        // Trying to look after the stream
-        if self.position.offset + offset >= self.length {
-            return Eof;
-        }
-
-        self.buffer[self.position.offset + offset]
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use super::*;
-
-    #[test]
-    fn test_stream() {
-        let mut chars = CharIterator::new();
-        assert!(chars.eof());
-
-        chars.read_from_str("foo", Some(Encoding::ASCII));
-        assert_eq!(chars.length, 3);
-        assert!(!chars.eof());
-        assert_eq!(chars.chars_left(), 3);
-
-        chars.read_from_str("f👽f", Some(Encoding::UTF8));
-        assert_eq!(chars.length, 3);
-        assert!(!chars.eof());
-        assert_eq!(chars.chars_left(), 3);
-        assert_eq!(chars.read_char(), Ch('f'));
-        assert_eq!(chars.chars_left(), 2);
-        assert!(!chars.eof());
-        assert_eq!(chars.read_char(), Ch('👽'));
-        assert!(!chars.eof());
-        assert_eq!(chars.chars_left(), 1);
-        assert_eq!(chars.read_char(), Ch('f'));
-        assert!(chars.eof());
-        assert_eq!(chars.chars_left(), 0);
-
-        chars.reset();
-        chars.set_encoding(Encoding::ASCII);
-        assert_eq!(chars.length, 6);
-        assert_eq!(chars.read_char(), Ch('f'));
-        assert_eq!(chars.read_char(), Ch('?'));
-        assert_eq!(chars.read_char(), Ch('?'));
-        assert_eq!(chars.read_char(), Ch('?'));
-        assert_eq!(chars.read_char(), Ch('?'));
-        assert_eq!(chars.read_char(), Ch('f'));
-        assert!(matches!(chars.read_char(), Eof));
-
-        chars.unread(); // unread eof
-        chars.unread(); // unread 'f'
-        chars.unread(); // Unread '?'
-        assert_eq!(chars.chars_left(), 2);
-        chars.unread();
-        assert_eq!(chars.chars_left(), 3);
-
-        chars.reset();
-        assert_eq!(chars.chars_left(), 6);
-        chars.unread();
-        assert_eq!(chars.chars_left(), 6);
-
-        chars.read_from_str("abc", Some(Encoding::UTF8));
-        chars.reset();
-        assert_eq!(chars.read_char(), Ch('a'));
-        chars.unread();
-        assert_eq!(chars.read_char(), Ch('a'));
-        assert_eq!(chars.read_char(), Ch('b'));
-        chars.unread();
-        assert_eq!(chars.read_char(), Ch('b'));
-        assert_eq!(chars.read_char(), Ch('c'));
-        chars.unread();
-        assert_eq!(chars.read_char(), Ch('c'));
-        assert!(matches!(chars.read_char(), Eof));
-        chars.unread();
-        assert!(matches!(chars.read_char(), Eof));
-    }
-
-    #[test]
-    fn test_certainty() {
-        let mut chars = CharIterator::new();
-        assert!(!chars.is_certain_encoding());
-
-        chars.set_confidence(Confidence::Certain);
-        assert!(chars.is_certain_encoding());
-
-        chars.set_confidence(Confidence::Tentative(0.5));
-        assert!(!chars.is_certain_encoding());
-    }
-
-    #[test]
-    fn test_eof() {
-        let mut chars = CharIterator::new();
-        chars.read_from_str("abc", Some(Encoding::UTF8));
-        assert_eq!(chars.length, 3);
-        assert_eq!(chars.chars_left(), 3);
-        assert_eq!(chars.read_char(), Ch('a'));
-        assert_eq!(chars.read_char(), Ch('b'));
-        assert_eq!(chars.read_char(), Ch('c'));
-        assert!(matches!(chars.read_char(), Eof));
-        assert!(matches!(chars.read_char(), Eof));
-        assert!(matches!(chars.read_char(), Eof));
-        assert!(matches!(chars.read_char(), Eof));
-        chars.unread();
-        assert!(matches!(chars.read_char(), Eof));
-        chars.unread();
-        chars.unread();
-        assert!(!matches!(chars.read_char(), Eof));
-        assert!(matches!(chars.read_char(), Eof));
-        chars.unread();
-        chars.unread();
-        assert!(!matches!(chars.read_char(), Eof));
-        chars.unread();
-        chars.unread();
-        chars.unread();
-        assert_eq!(chars.read_char(), Ch('a'));
-        chars.unread();
-        assert_eq!(chars.read_char(), Ch('a'));
-        chars.unread();
-        chars.unread();
-        assert_eq!(chars.read_char(), Ch('a'));
-        chars.unread();
-        chars.unread();
-        chars.unread();
-        chars.unread();
-        chars.unread();
-        chars.unread();
-        assert_eq!(chars.read_char(), Ch('a'));
-        assert_eq!(chars.read_char(), Ch('b'));
-        assert_eq!(chars.read_char(), Ch('c'));
-        assert!(matches!(chars.read_char(), Eof));
-        chars.unread();
-        chars.unread();
-        assert_eq!(chars.read_char(), Ch('c'));
-        assert!(matches!(chars.read_char(), Eof));
-        chars.unread();
-        assert!(matches!(chars.read_char(), Eof));
-    }
-
-    #[test]
-    fn test_detect_encoding() {
-        let mut chars = CharIterator::new();
-        chars.read_from_str("abc", Some(Encoding::UTF8));
-        chars.detect_encoding();
-        assert!(matches!(chars.encoding, Encoding::ASCII));
-        assert!(matches!(chars.confidence, Confidence::Certain));
-
-        let mut chars = CharIterator::new();
-        chars.read_from_str("abc浏览器", Some(Encoding::UTF8));
-        chars.detect_encoding();
-        assert!(matches!(chars.encoding, Encoding::UTF8));
-        assert!(matches!(chars.confidence, Confidence::Tentative(_)));
-    }
-
-    #[test]
-    fn test_iter() {
-        let mut chars = CharIterator::new();
-        chars.read_from_str("abc", Some(Encoding::UTF8));
-        assert_eq!(chars.next(), Some('a'));
-        assert_eq!(chars.next(), Some('b'));
-        assert_eq!(chars.next(), Some('c'));
-        assert_eq!(chars.next(), None);
-        assert!(chars.eof());
-    }
-
-    #[test]
-    fn test_peekable() {
-        let mut chars = CharIterator::new();
-        chars.read_from_str("abc", Some(Encoding::UTF8));
-        let mut peekable = chars.peekable();
-        assert_eq!(peekable.peek(), Some(&'a'));
-        assert_eq!(peekable.next(), Some('a'));
-        assert_eq!(peekable.peek(), Some(&'b'));
-        assert_eq!(peekable.next(), Some('b'));
-        let nxt = peekable.peek_mut().unwrap();
-        *nxt = 'd';
-        assert_eq!(peekable.peek(), Some(&'d'));
-        assert_eq!(peekable.next(), Some('d'));
-        assert_eq!(peekable.next(), None);
-    }
-}
diff --git a/crates/gosub_shared/src/lib.rs b/crates/gosub_shared/src/lib.rs
index 1f25ae672..9e46b5c77 100644
--- a/crates/gosub_shared/src/lib.rs
+++ b/crates/gosub_shared/src/lib.rs
@@ -3,7 +3,5 @@
 //! This crate supplies a lot of shared functionality in the gosub engine.
 //!
 pub mod byte_stream;
-pub mod bytes;
-
 pub mod timing;
 pub mod types;
diff --git a/crates/gosub_shared/src/types.rs b/crates/gosub_shared/src/types.rs
index c953efeab..59ec134af 100644
--- a/crates/gosub_shared/src/types.rs
+++ b/crates/gosub_shared/src/types.rs
@@ -1,4 +1,5 @@
 //! Error results that can be returned from the engine
+use crate::byte_stream::Location;
 use thiserror::Error;
 /// Parser error that defines an error (message) on the given position
@@ -6,12 +7,8 @@ use thiserror::Error;
 pub struct ParseError {
     /// Parse error message
     pub message: String,
-    /// Line number (1-based) of the error
-    pub line: usize,
-    // Column (1-based) on line of the error
-    pub col: usize,
-    // Position (0-based) of the error in the input stream
-    pub offset: usize,
+    /// Location of the error
+    pub location: Location,
 }
 /// Serious errors and errors from third-party libraries
diff --git a/crates/gosub_testing/src/testing/tokenizer.rs b/crates/gosub_testing/src/testing/tokenizer.rs
index d46c65970..1d4fe8333 100644
--- a/crates/gosub_testing/src/testing/tokenizer.rs
+++ b/crates/gosub_testing/src/testing/tokenizer.rs
@@ -8,7 +8,7 @@ use gosub_html5::{
         {Options, Tokenizer},
     },
 };
-use gosub_shared::bytes::CharIterator;
+use gosub_shared::byte_stream::ByteStream;
 use gosub_shared::types::Result;
 use lazy_static::lazy_static;
 use regex::{Captures, Regex};
@@ -24,7 +24,7 @@ use std::{
 };
 pub struct TokenizerBuilder {
-    chars: CharIterator,
+    stream: ByteStream,
     state: TokenState,
     last_start_tag: Option<String>,
 }
@@ -33,7 +33,7 @@ impl TokenizerBuilder {
     pub fn build(&mut self) -> Tokenizer<'_> {
         let error_logger = Rc::new(RefCell::new(ErrorLogger::new()));
         Tokenizer::new(
-            &mut self.chars,
+            &mut self.stream,
             Some(Options {
                 initial_state: self.state,
                 last_start_tag: self.last_start_tag.clone().unwrap_or_default(),
@@ -185,16 +185,17 @@ impl TestSpec {
         }
         for state in states {
-            let mut chars = CharIterator::new();
+            let mut stream = ByteStream::new();
             let input = if self.double_escaped {
                 from_utf16_lossy(&self.input)
             } else {
                 self.input.to_string()
             };
-            chars.read_from_str(input.as_str(), None);
+            stream.read_from_str(input.as_str(), None);
+            stream.close();
             let builder = TokenizerBuilder {
-                chars,
+                stream,
                 last_start_tag: self.last_start_tag.clone(),
                 state,
             };
@@ -218,7 +219,12 @@ impl TestSpec {
         // There can be multiple tokens to match. Make sure we match all of them
         for expected in &self.output {
             let actual = tokenizer.next_token(ParserData::default()).unwrap();
-            assert_eq!(self.escape(&actual), self.escape(expected));
+            assert_eq!(
+                self.escape(&actual),
+                self.escape(expected),
+                "build state: {:?}",
+                builder.state
+            );
         }
         let borrowed_error_logger = tokenizer.error_logger.borrow();
@@ -247,8 +253,8 @@ impl TestSpec {
         // Iterate all generated errors to see if we have an exact match
         for actual in tokenizer.get_error_logger().get_errors() {
             if actual.message == expected.code
-                && actual.line == expected.line
-                && actual.col == expected.col
+                && actual.location.line == expected.line
+                && actual.location.column == expected.col
             {
                 return;
             }
@@ -258,7 +264,7 @@ impl TestSpec {
         // it's not always correct, it might be a off-by-one position.
         for actual in tokenizer.get_error_logger().get_errors() {
             if actual.message == expected.code
-                && (actual.line != expected.line || actual.col != expected.col)
+                && (actual.location.line != expected.line || actual.location.column != expected.col)
             {
                 panic!(
                     "[{}]: wanted {:?}, got {:?}",
diff --git a/crates/gosub_testing/src/testing/tree_construction.rs b/crates/gosub_testing/src/testing/tree_construction.rs
index b903119ea..262d1bb99 100644
--- a/crates/gosub_testing/src/testing/tree_construction.rs
+++ b/crates/gosub_testing/src/testing/tree_construction.rs
@@ -10,7 +10,7 @@ use gosub_html5::parser::document::DocumentBuilder;
 use gosub_html5::parser::document::{Document, DocumentHandle};
 use gosub_html5::parser::tree_builder::TreeBuilder;
 use gosub_html5::parser::{Html5Parser, Html5ParserOptions};
-use gosub_shared::bytes::CharIterator;
+use gosub_shared::byte_stream::ByteStream;
 use gosub_shared::types::{ParseError, Result};
 use parser::{ScriptMode, TestSpec};
 use result::TestResult;
@@ -87,19 +87,22 @@ impl Harness {
     /// Run the html5 parser and return the document tree and errors
     fn do_parse(&mut self, scripting_enabled: bool) -> Result<(DocumentHandle, Vec<ParseError>)> {
         let options = Html5ParserOptions { scripting_enabled };
-        let mut chars = CharIterator::new();
-        chars.read_from_str(self.test.spec_data(), None);
+        let mut stream = ByteStream::new();
+        stream.read_from_str(self.test.spec_data(), None);
+        stream.close();
-        let (document, parse_errors) = if let Some(fragment) =
-            self.test.spec.document_fragment.clone()
-        {
-            self.parse_fragment(fragment, chars, options)?
-        } else {
-            let document = DocumentBuilder::new_document(None);
-            let parser_errors =
-                Html5Parser::parse_document(&mut chars, Document::clone(&document), Some(options))?;
-            (document, parser_errors)
-        };
+        let (document, parse_errors) =
+            if let Some(fragment) = self.test.spec.document_fragment.clone() {
+                self.parse_fragment(fragment, stream, options)?
+            } else {
+                let document = DocumentBuilder::new_document(None);
+                let parser_errors = Html5Parser::parse_document(
+                    &mut stream,
+                    Document::clone(&document),
+                    Some(options),
+                )?;
+                (document, parser_errors)
+            };
         Ok((document, parse_errors))
     }
@@ -107,7 +110,7 @@ impl Harness {
     fn parse_fragment(
         &mut self,
         fragment: String,
-        mut chars: CharIterator,
+        mut stream: ByteStream,
         options: Html5ParserOptions,
     ) -> Result<(DocumentHandle, Vec<ParseError>)> {
         // First, create a (fake) main document that contains only the fragment as node
@@ -139,7 +142,7 @@
         let document = DocumentBuilder::new_document_fragment(&context_node);
         let parser_errors = Html5Parser::parse_fragment(
-            &mut chars,
+            &mut stream,
             Document::clone(&document),
             &context_node,
             Some(options),
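Because `ParseError` now carries a single `Location` (see the `gosub_shared::types` change above), every consumer reports positions the same way. A hedged end-to-end sketch of the new calling convention; the input snippet is illustrative, while the API calls are the ones updated throughout this diff:

```rust
use gosub_html5::parser::document::{Document, DocumentBuilder};
use gosub_html5::parser::Html5Parser;
use gosub_shared::byte_stream::{ByteStream, Encoding};
use gosub_shared::types::Result;

fn main() -> Result<()> {
    // Streams are now explicitly closed before parsing, matching the
    // call sites in this diff.
    let mut stream = ByteStream::new();
    stream.read_from_str("<p>unclosed paragraph", Some(Encoding::UTF8));
    stream.close();

    let document = DocumentBuilder::new_document(None);
    let errors = Html5Parser::parse_document(&mut stream, Document::clone(&document), None)?;

    // line/col/offset are gone; everything hangs off error.location.
    for e in errors {
        println!("{} at {}:{}", e.message, e.location.line, e.location.column);
    }
    Ok(())
}
```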
diff --git a/docs/parsing.md b/docs/parsing.md
index 35d2cc549..a77b69417 100644
--- a/docs/parsing.md
+++ b/docs/parsing.md
@@ -3,18 +3,15 @@
 Parsing a HTML5 site is not difficult, although it currently require some manual work. Later on, this
 will be encapsulated in the engine API.
 First, we need to fetch the actual HTML content. This can be done by a simple HTTP request, or reading a file from disk. These HTML bytes must be
-passed to the char streamer:
+passed to the byte streamer so it can be converted to characters without worrying about the encoding:
 ```rust
-
-    let mut chars = CharIterator::new();
-    chars.read_from_str(&html, Some(Encoding::UTF8));
+    let mut stream = ByteStream::new();
+    stream.read_from_str(&html, Some(Encoding::UTF8));
+    stream.close();
 ```
-Here, the &html points to a string containing the HTML content. The `CharIterator` will take care of converting the bytes to characters, and handle the encoding.
+Here, `&html` points to a string containing the HTML content. The `ByteStream` will take care of converting the bytes to characters, and handle the encoding.
 We assume UTF-8 here, but other encodings could be supported later on as well.
-
 Next, we need to create a document, which will be the main object that will be filled by the parser. The document will contain all the node elements and other data that is
 generated during the parsing of the HTML. This also includes any stylesheets that are found, both internally and externally.
@@ -26,7 +23,7 @@ Note that a document itself isn't a document, but a HANDLE to a document (a `Doc
 by calling the `parse_document` method on the `Html5Parser` struct. This method will return a list of parse errors, if any.
 ```rust
-    let parse_errors = Html5Parser::parse_document(&mut chars, Document::clone(&document), None)?;
+    let parse_errors = Html5Parser::parse_document(&mut stream, Document::clone(&document), None)?;
     for e in parse_errors {
         println!("Parse Error: {}", e.message);
diff --git a/examples/html5-parser.rs b/examples/html5-parser.rs
index 1945d0917..259f35374 100644
--- a/examples/html5-parser.rs
+++ b/examples/html5-parser.rs
@@ -1,15 +1,16 @@
 use gosub_html5::parser::document::{Document, DocumentBuilder};
 use gosub_html5::parser::Html5Parser;
-use gosub_shared::bytes::{CharIterator, Encoding};
+use gosub_shared::byte_stream::{ByteStream, Encoding};
 fn main() {
     // Creates an input stream
-    let mut chars = CharIterator::new();
-    chars.read_from_str("<p>Hello<b>world</b></p>", Some(Encoding::UTF8));
+    let mut stream = ByteStream::new();
+    stream.read_from_str("<p>Hello<b>world</b></p>", Some(Encoding::UTF8));
", Some(Encoding::UTF8)); + stream.close(); // Initialize a document and feed it together with the stream to the html5 parser let document = DocumentBuilder::new_document(None); - let _ = Html5Parser::parse_document(&mut chars, Document::clone(&document), None); + let _ = Html5Parser::parse_document(&mut stream, Document::clone(&document), None); // document now contains the html5 node tree println!("Generated tree: \n\n {}", document); diff --git a/src/bin/css3-parser.rs b/src/bin/css3-parser.rs index 9b9b5c2a1..af1b62feb 100644 --- a/src/bin/css3-parser.rs +++ b/src/bin/css3-parser.rs @@ -1,9 +1,8 @@ use anyhow::{anyhow, bail, Result}; -use gosub_css3::location::Location; use gosub_css3::parser_config::ParserConfig; use gosub_css3::tokenizer::{TokenType, Tokenizer}; use gosub_css3::{walker, Css3, Error}; -use gosub_shared::byte_stream::{ByteStream, Encoding, Stream}; +use gosub_shared::byte_stream::{ByteStream, Encoding, Location}; use simple_logger::SimpleLogger; use std::fs; @@ -102,12 +101,8 @@ fn main() -> Result<()> { fn display_snippet(css: &str, err: Error) { let loc = err.location.clone(); let lines: Vec<&str> = css.split('\n').collect(); - let line_nr = loc.line() - 1; - let col_nr = if loc.column() < 2 { - 0 - } else { - loc.column() - 2 - }; + let line_nr = loc.line - 1; + let col_nr = if loc.column < 2 { 0 } else { loc.column - 2 }; if col_nr > 1000 { println!("Error is too far to the right to display."); @@ -125,26 +120,26 @@ fn display_snippet(css: &str, err: Error) { } // Print the line with the error and a pointer to the error - println!("{:<5}|{}", line_nr + 1, lines[line_nr as usize]); - println!(" ---{}^", "-".repeat(col_nr as usize)); + println!("{:<5}|{}", line_nr + 1, lines[line_nr]); + println!(" ---{}^", "-".repeat(col_nr)); // Print the next 5 lines for n in line_nr + 1..line_nr + 6 { - if n > lines.len() as u32 - 1 { + if n > lines.len() - 1 { continue; } - println!("{:<5}|{}", n + 1, lines[n as usize]); + println!("{:<5}|{}", n + 1, lines[n]); } println!(); println!(); } fn print_tokens(css: String) { - let mut it = ByteStream::new(); - it.read_from_str(&css, Some(Encoding::UTF8)); - it.close(); + let mut stream = ByteStream::new(); + stream.read_from_str(&css, Some(Encoding::UTF8)); + stream.close(); - let mut tokenizer = Tokenizer::new(&mut it, Location::default()); + let mut tokenizer = Tokenizer::new(&mut stream, Location::default()); loop { let token = tokenizer.consume(); println!("{:?}", token); diff --git a/src/bin/gosub-parser.rs b/src/bin/gosub-parser.rs index 611002995..11be25e74 100644 --- a/src/bin/gosub-parser.rs +++ b/src/bin/gosub-parser.rs @@ -1,7 +1,7 @@ use anyhow::bail; use gosub_html5::parser::document::{Document, DocumentBuilder}; use gosub_html5::parser::Html5Parser; -use gosub_shared::bytes::{CharIterator, Confidence, Encoding}; +use gosub_shared::byte_stream::{ByteStream, Confidence, Encoding}; use gosub_shared::timing::Scale; use gosub_shared::timing_display; use gosub_shared::types::Result; @@ -50,20 +50,21 @@ fn main() -> Result<()> { bail("Invalid url scheme"); }; - let mut chars = CharIterator::new(); - chars.read_from_str(&html, Some(Encoding::UTF8)); - chars.set_confidence(Confidence::Certain); + let mut stream = ByteStream::new(); + stream.read_from_str(&html, Some(Encoding::UTF8)); + stream.set_confidence(Confidence::Certain); + stream.close(); // If the encoding confidence is not Confidence::Certain, we should detect the encoding. 
-    if !chars.is_certain_encoding() {
-        chars.detect_encoding();
+    if !stream.is_certain_encoding() {
+        stream.detect_encoding();
     }
     // SimpleLogger::new().init().unwrap();
     // Create a new document that will be filled in by the parser
     let handle = DocumentBuilder::new_document(Some(url));
-    let parse_errors = Html5Parser::parse_document(&mut chars, Document::clone(&handle), None)?;
+    let parse_errors = Html5Parser::parse_document(&mut stream, Document::clone(&handle), None)?;
     println!("Found {} stylesheets", handle.get().stylesheets.len());
     for sheet in &handle.get().stylesheets {
diff --git a/src/bin/style-parser.rs b/src/bin/style-parser.rs
index 31bf63f67..edf19d94d 100644
--- a/src/bin/style-parser.rs
+++ b/src/bin/style-parser.rs
@@ -6,7 +6,7 @@ use url::Url;
 use gosub_html5::parser::document::Document;
 use gosub_html5::parser::document::DocumentBuilder;
 use gosub_html5::parser::Html5Parser;
-use gosub_shared::bytes::{CharIterator, Confidence, Encoding};
+use gosub_shared::byte_stream::{ByteStream, Confidence, Encoding};
 // struct TextVisitor {
 //     color: String,
@@ -118,13 +118,14 @@ fn main() -> Result<()> {
         bail!("Unsupported url scheme: {}", url.scheme());
     };
-    let mut chars = CharIterator::new();
-    chars.read_from_str(&html, Some(Encoding::UTF8));
-    chars.set_confidence(Confidence::Certain);
+    let mut stream = ByteStream::new();
+    stream.read_from_str(&html, Some(Encoding::UTF8));
+    stream.set_confidence(Confidence::Certain);
+    stream.close();
     let doc_handle = DocumentBuilder::new_document(Some(url));
     let _parse_errors =
-        Html5Parser::parse_document(&mut chars, Document::clone(&doc_handle), None)?;
+        Html5Parser::parse_document(&mut stream, Document::clone(&doc_handle), None)?;
     // let _render_tree = generate_render_tree(Document::clone(&doc_handle))?;
diff --git a/src/bin/test-user-agent.rs b/src/bin/test-user-agent.rs
index 3db1be0f6..4cc5872e1 100644
--- a/src/bin/test-user-agent.rs
+++ b/src/bin/test-user-agent.rs
@@ -1,7 +1,7 @@
 use gosub_html5::node::{Node, NodeData};
 use gosub_html5::parser::document::DocumentBuilder;
 use gosub_html5::parser::{document::Document, Html5Parser};
-use gosub_shared::bytes::{CharIterator, Confidence, Encoding};
+use gosub_shared::byte_stream::{ByteStream, Confidence, Encoding};
 use gosub_shared::types::Result;
 use std::process::exit;
@@ -22,17 +22,18 @@ fn main() -> Result<()> {
     }
     let html = response.into_string()?;
-    let mut chars = CharIterator::new();
-    chars.read_from_str(&html, Some(Encoding::UTF8));
-    chars.set_confidence(Confidence::Certain);
+    let mut stream = ByteStream::new();
+    stream.read_from_str(&html, Some(Encoding::UTF8));
+    stream.set_confidence(Confidence::Certain);
+    stream.close();
     // If the encoding confidence is not Confidence::Certain, we should detect the encoding.
-    if !chars.is_certain_encoding() {
-        chars.detect_encoding()
+    if !stream.is_certain_encoding() {
+        stream.detect_encoding()
     }
     let document = DocumentBuilder::new_document(None);
-    let parse_errors = Html5Parser::parse_document(&mut chars, Document::clone(&document), None)?;
+    let parse_errors = Html5Parser::parse_document(&mut stream, Document::clone(&document), None)?;
     match get_node_by_path(&document.get(), vec!["html", "body"]) {
         None => {
diff --git a/src/engine.rs b/src/engine.rs
index f260eede9..bc90589ee 100644
--- a/src/engine.rs
+++ b/src/engine.rs
@@ -1,3 +1,4 @@
+use gosub_shared::byte_stream::{ByteStream, Confidence, Encoding};
 #[cfg(not(target_arch = "wasm32"))]
 use {
     cookie::CookieJar,
@@ -8,7 +9,6 @@ use {
         dns::{Dns, ResolveType},
         http::{headers::Headers, request::Request, response::Response},
     },
-    gosub_shared::bytes::{CharIterator, Confidence, Encoding},
     gosub_shared::types::{Error, ParseError, Result},
     gosub_shared::{timing_start, timing_stop},
     std::io::Read,
@@ -47,7 +47,11 @@ impl Debug for FetchResponse {
         writeln!(f, "{}", self.document)?;
         writeln!(f, "Parse errors:")?;
         for error in &self.parse_errors {
-            writeln!(f, " ({}:{}) {}", error.line, error.col, error.message)?;
+            writeln!(
+                f,
+                " ({}:{}) {}",
+                error.location.line, error.location.column, error.message
+            )?;
         }
         writeln!(f, "Render tree:")?;
         writeln!(f, "{}", self.render_tree)?;
@@ -134,12 +138,13 @@ fn fetch_url(
     let t_id = timing_start!("html.parse", parts.as_str());
-    let mut chars = CharIterator::new();
-    let _ = chars.read_from_bytes(&fetch_response.response.body, Some(Encoding::UTF8));
-    chars.set_confidence(Confidence::Certain);
+    let mut stream = ByteStream::new();
+    let _ = stream.read_from_bytes(&fetch_response.response.body, Some(Encoding::UTF8));
+    stream.set_confidence(Confidence::Certain);
     fetch_response.document = DocumentBuilder::new_document(Some(parts));
-    match Html5Parser::parse_document(&mut chars, Document::clone(&fetch_response.document), None) {
+    match Html5Parser::parse_document(&mut stream, Document::clone(&fetch_response.document), None)
+    {
         Ok(parse_errors) => {
             fetch_response.parse_errors = parse_errors;
         }
diff --git a/src/wasm/css.rs b/src/wasm/css.rs
index da1dcdc0c..6d2b965a2 100644
--- a/src/wasm/css.rs
+++ b/src/wasm/css.rs
@@ -1,9 +1,8 @@
-use gosub_css3::location::Location;
 use gosub_css3::parser_config::ParserConfig;
 use gosub_css3::tokenizer::{TokenType, Tokenizer};
 use gosub_css3::walker::Walker;
 use gosub_css3::{Css3, Error};
-use gosub_shared::byte_stream::{ByteStream, Encoding, Stream};
+use gosub_shared::byte_stream::{ByteStream, Encoding, Location};
 use wasm_bindgen::prelude::wasm_bindgen;
 #[wasm_bindgen]
@@ -76,12 +75,8 @@ pub fn css3_parser(input: &str, opts: CssOptions) -> CssOutput {
 fn display_snippet(css: &str, err: Error) -> String {
     let loc = err.location.clone();
     let lines: Vec<&str> = css.split('\n').collect();
-    let line_nr = loc.line() - 1;
-    let col_nr = if loc.column() < 2 {
-        0
-    } else {
-        loc.column() - 2
-    };
+    let line_nr = loc.line - 1;
+    let col_nr = if loc.column < 2 { 0 } else { loc.column - 2 };
     if col_nr > 1000 {
         return String::from("Error is too far to the right to display.");
@@ -117,11 +112,11 @@
 }
 fn print_tokens(css: &str) -> String {
-    let mut it = ByteStream::new();
-    it.read_from_str(css, Some(Encoding::UTF8));
-    it.close();
+    let mut stream = ByteStream::new();
+    stream.read_from_str(css, Some(Encoding::UTF8));
+    stream.close();
-    let mut tokenizer = Tokenizer::new(&mut it, Location::default());
+    let mut tokenizer = Tokenizer::new(&mut stream, Location::default());
     let mut out = String::new();
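The CSS tokenizer now takes its starting `Location` from `gosub_shared::byte_stream` as well, as the `css3-parser` and `wasm/css.rs` changes show. A small sketch of the token loop mirroring the `print_tokens` helpers; note that the `token.token_type` field and `TokenType::Eof` variant are assumed from the imports shown here, not spelled out in this diff:

```rust
use gosub_css3::tokenizer::{TokenType, Tokenizer};
use gosub_shared::byte_stream::{ByteStream, Encoding, Location};

fn main() {
    let mut stream = ByteStream::new();
    stream.read_from_str("body { color: red; }", Some(Encoding::UTF8));
    stream.close();

    // The tokenizer is handed an explicit start location (1:1 here).
    let mut tokenizer = Tokenizer::new(&mut stream, Location::default());
    loop {
        let token = tokenizer.consume();
        println!("{:?}", token);
        // Assumed EOF check to terminate the loop.
        if token.token_type == TokenType::Eof {
            break;
        }
    }
}
```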
diff --git a/src/wasm/html.rs b/src/wasm/html.rs
index c1ed2fed7..e86de6766 100644
--- a/src/wasm/html.rs
+++ b/src/wasm/html.rs
@@ -4,7 +4,6 @@ use url::Url;
 use gosub_html5::parser::document::{Document, DocumentBuilder};
 use gosub_html5::parser::Html5Parser;
-use gosub_shared::bytes::{CharIterator, Confidence, Encoding};
+use gosub_shared::byte_stream::{ByteStream, Confidence, Encoding};
 use wasm_bindgen::prelude::wasm_bindgen;
 #[wasm_bindgen]
@@ -46,13 +45,14 @@ pub fn html_parser(input: &str, opts: HTMLOptions) -> HTMLOutput {
     let url = Url::parse(&opts.url).ok();
     let doc = DocumentBuilder::new_document(url);
-    let mut chars = CharIterator::new();
-    chars.read_from_str(&input, Some(Encoding::UTF8));
-    chars.set_confidence(Confidence::Certain);
+    let mut stream = ByteStream::new();
+    stream.read_from_str(&input, Some(Encoding::UTF8));
+    stream.set_confidence(Confidence::Certain);
+    stream.close();
     let mut errors = String::new();
-    match Html5Parser::parse_document(&mut chars, Document::clone(&doc), None) {
+    match Html5Parser::parse_document(&mut stream, Document::clone(&doc), None) {
         Ok(errs) => {
             for e in errs {
-                errors.push_str(&format!("{}@{}:{}\n", e.message, e.line, e.col));
+                errors.push_str(&format!("{}@{}:{}\n", e.message, e.location.line, e.location.column));
diff --git a/src/wasm/renderer.rs b/src/wasm/renderer.rs
index 2e9139992..6ff03fd06 100644
--- a/src/wasm/renderer.rs
+++ b/src/wasm/renderer.rs
@@ -7,7 +7,6 @@ use gosub_html5::parser::Html5Parser;
 use gosub_renderer::render_tree::TreeDrawer;
 use gosub_renderer::renderer::{Renderer, RendererOptions as GRendererOptions};
 use gosub_rendering::layout::generate_taffy_tree;
-use gosub_shared::bytes::{CharIterator, Confidence, Encoding};
+use gosub_shared::byte_stream::{ByteStream, Confidence, Encoding};
 use gosub_shared::types::Result;
 use gosub_styling::render_tree::generate_render_tree;
 use gosub_styling::render_tree::RenderTree as StyleTree;
@@ -88,13 +87,14 @@ async fn renderer_internal(opts: RendererOptions) -> Result<()> {
 }
 fn load_html_rendertree(input: &str, url: Url) -> Result<StyleTree> {
-    let mut chars = CharIterator::new();
-    chars.read_from_str(&input, Some(Encoding::UTF8));
-    chars.set_confidence(Confidence::Certain);
+    let mut stream = ByteStream::new();
+    stream.read_from_str(&input, Some(Encoding::UTF8));
+    stream.set_confidence(Confidence::Certain);
+    stream.close();
     let doc_handle = DocumentBuilder::new_document(Some(url));
     let _parse_errors =
-        Html5Parser::parse_document(&mut chars, Document::clone(&doc_handle), None)?;
+        Html5Parser::parse_document(&mut stream, Document::clone(&doc_handle), None)?;
     generate_render_tree(Document::clone(&doc_handle))
 }
diff --git a/src/wasm/styles.rs b/src/wasm/styles.rs
index 5c5cdcd26..6b8fdbcfa 100644
--- a/src/wasm/styles.rs
+++ b/src/wasm/styles.rs
@@ -2,7 +2,6 @@ use url::Url;
 use gosub_html5::parser::document::{Document, DocumentBuilder};
 use gosub_html5::parser::Html5Parser;
-use gosub_shared::bytes::{CharIterator, Confidence, Encoding};
+use gosub_shared::byte_stream::{ByteStream, Confidence, Encoding};
 use gosub_styling::render_tree::generate_render_tree;
 use wasm_bindgen::prelude::wasm_bindgen;
@@ -45,12 +44,14 @@ pub fn styles_parser(input: &str, opts: StylesOptions) -> StylesOutput {
     let url = Url::parse(&opts.url).ok();
     let doc = DocumentBuilder::new_document(url);
-    let mut chars = CharIterator::new();
-    chars.read_from_str(&input, Some(Encoding::UTF8));
-    chars.set_confidence(Confidence::Certain);
+    let mut stream = ByteStream::new();
+    stream.read_from_str(&input, Some(Encoding::UTF8));
+    stream.set_confidence(Confidence::Certain);
+    stream.close();
+
     let mut errors = String::new();
-    match Html5Parser::parse_document(&mut chars, Document::clone(&doc), None) {
+    match Html5Parser::parse_document(&mut stream, Document::clone(&doc), None) {
         Ok(errs) => {
             for e in errs {
-                errors.push_str(&format!("{}@{}:{}\n", e.message, e.line, e.col));
+                errors.push_str(&format!("{}@{}:{}\n", e.message, e.location.line, e.location.column));