From 964edd1260e1545b2115e96abd5638c6aabaed75 Mon Sep 17 00:00:00 2001 From: Yuji Sugiura Date: Wed, 2 Oct 2024 10:08:41 +0900 Subject: [PATCH] refactor(regular_expression): Misc fixes --- crates/oxc_regular_expression/src/ast.rs | 26 ++- .../src/{ => ast_impl}/display.rs | 12 +- .../src/ast_impl/mod.rs | 2 + .../src/{ => ast_impl}/visit.rs | 3 + .../src/body_parser/parser.rs | 196 +++++++++++------- .../src/body_parser/state.rs | 2 +- .../src/{flag_parser.rs => flags_parser.rs} | 2 +- crates/oxc_regular_expression/src/lib.rs | 11 +- .../src/literal_parser.rs | 4 +- .../src/{span.rs => span_factory.rs} | 0 10 files changed, 159 insertions(+), 99 deletions(-) rename crates/oxc_regular_expression/src/{ => ast_impl}/display.rs (97%) create mode 100644 crates/oxc_regular_expression/src/ast_impl/mod.rs rename crates/oxc_regular_expression/src/{ => ast_impl}/visit.rs (98%) rename crates/oxc_regular_expression/src/{flag_parser.rs => flags_parser.rs} (96%) rename crates/oxc_regular_expression/src/{span.rs => span_factory.rs} (100%) diff --git a/crates/oxc_regular_expression/src/ast.rs b/crates/oxc_regular_expression/src/ast.rs index eb5bf97d3ebf4..3644246113889 100644 --- a/crates/oxc_regular_expression/src/ast.rs +++ b/crates/oxc_regular_expression/src/ast.rs @@ -1,7 +1,3 @@ -// NB: `#[span]`, `#[scope(...)]`,`#[visit(...)]` and `#[generate_derive(...)]` do NOT do anything to the code. -// They are purely markers for codegen used in `tasks/ast_tools` and `crates/oxc_traverse/scripts`. See docs in those crates. -// Read [`macro@oxc_ast_macros::ast`] for more information. 
- // Silence erroneous warnings from Rust Analyser for `#[derive(Tsify)]` #![allow(non_snake_case)] @@ -76,19 +72,19 @@ pub struct Alternative<'a> { #[cfg_attr(feature = "serialize", derive(Serialize, Tsify))] pub enum Term<'a> { // Assertion, QuantifiableAssertion - BoundaryAssertion(BoundaryAssertion) = 0, + BoundaryAssertion(Box<'a, BoundaryAssertion>) = 0, LookAroundAssertion(Box<'a, LookAroundAssertion<'a>>) = 1, // Quantifier Quantifier(Box<'a, Quantifier<'a>>) = 2, // Atom, ExtendedAtom - Character(Character) = 3, + Character(Box<'a, Character>) = 3, Dot(Dot) = 4, - CharacterClassEscape(CharacterClassEscape) = 5, + CharacterClassEscape(Box<'a, CharacterClassEscape>) = 5, UnicodePropertyEscape(Box<'a, UnicodePropertyEscape<'a>>) = 6, CharacterClass(Box<'a, CharacterClass<'a>>) = 7, CapturingGroup(Box<'a, CapturingGroup<'a>>) = 8, IgnoreGroup(Box<'a, IgnoreGroup<'a>>) = 9, - IndexedReference(IndexedReference) = 10, + IndexedReference(Box<'a, IndexedReference>) = 10, NamedReference(Box<'a, NamedReference<'a>>) = 11, } @@ -286,9 +282,9 @@ pub enum CharacterClassContentsKind { #[cfg_attr(feature = "serialize", derive(Serialize, Tsify))] pub enum CharacterClassContents<'a> { CharacterClassRange(Box<'a, CharacterClassRange>) = 0, - CharacterClassEscape(CharacterClassEscape) = 1, + CharacterClassEscape(Box<'a, CharacterClassEscape>) = 1, UnicodePropertyEscape(Box<'a, UnicodePropertyEscape<'a>>) = 2, - Character(Character) = 3, + Character(Box<'a, Character>) = 3, /// `UnicodeSetsMode` only NestedCharacterClass(Box<'a, CharacterClass<'a>>) = 4, /// `UnicodeSetsMode` only @@ -404,3 +400,13 @@ pub struct NamedReference<'a> { pub span: Span, pub name: Atom<'a>, } + +// See `oxc_ast/src/lib.rs` for the details +#[cfg(target_pointer_width = "64")] +#[test] +fn size_asserts() { + use std::mem::size_of; + + assert!(size_of::<Term>() == 16); + assert!(size_of::<CharacterClassContents>() == 16); +} diff --git a/crates/oxc_regular_expression/src/display.rs 
b/crates/oxc_regular_expression/src/ast_impl/display.rs similarity index 97% rename from crates/oxc_regular_expression/src/display.rs rename to crates/oxc_regular_expression/src/ast_impl/display.rs index ce790dc4524f6..939de35b3be31 100644 --- a/crates/oxc_regular_expression/src/display.rs +++ b/crates/oxc_regular_expression/src/ast_impl/display.rs @@ -78,17 +78,17 @@ impl<'a> Display for Alternative<'a> { impl<'a> Display for Term<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Self::BoundaryAssertion(it) => write!(f, "{it}"), + Self::BoundaryAssertion(it) => write!(f, "{}", it.as_ref()), Self::LookAroundAssertion(it) => write!(f, "{}", it.as_ref()), Self::Quantifier(it) => write!(f, "{}", it.as_ref()), - Self::Character(it) => write!(f, "{it}"), + Self::Character(it) => write!(f, "{}", it.as_ref()), Self::Dot(it) => write!(f, "{it}"), - Self::CharacterClassEscape(it) => write!(f, "{it}"), + Self::CharacterClassEscape(it) => write!(f, "{}", it.as_ref()), Self::UnicodePropertyEscape(it) => write!(f, "{}", it.as_ref()), Self::CharacterClass(it) => write!(f, "{}", it.as_ref()), Self::CapturingGroup(it) => write!(f, "{}", it.as_ref()), Self::IgnoreGroup(it) => write!(f, "{}", it.as_ref()), - Self::IndexedReference(it) => write!(f, "{it}"), + Self::IndexedReference(it) => write!(f, "{}", it.as_ref()), Self::NamedReference(it) => write!(f, "{}", it.as_ref()), } } @@ -246,9 +246,9 @@ impl<'a> Display for CharacterClassContents<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::CharacterClassRange(it) => write!(f, "{}", it.as_ref()), - Self::CharacterClassEscape(it) => write!(f, "{it}"), + Self::CharacterClassEscape(it) => write!(f, "{}", it.as_ref()), Self::UnicodePropertyEscape(it) => write!(f, "{}", it.as_ref()), - Self::Character(it) => write!(f, "{it}"), + Self::Character(it) => write!(f, "{}", it.as_ref()), Self::NestedCharacterClass(it) => write!(f, "{}", it.as_ref()), Self::ClassStringDisjunction(it) 
=> write!(f, "{}", it.as_ref()), } diff --git a/crates/oxc_regular_expression/src/ast_impl/mod.rs b/crates/oxc_regular_expression/src/ast_impl/mod.rs new file mode 100644 index 0000000000000..fda7eef43f68e --- /dev/null +++ b/crates/oxc_regular_expression/src/ast_impl/mod.rs @@ -0,0 +1,2 @@ +mod display; +pub mod visit; diff --git a/crates/oxc_regular_expression/src/visit.rs b/crates/oxc_regular_expression/src/ast_impl/visit.rs similarity index 98% rename from crates/oxc_regular_expression/src/visit.rs rename to crates/oxc_regular_expression/src/ast_impl/visit.rs index 2e987de3ce362..88b9ff4caf79a 100644 --- a/crates/oxc_regular_expression/src/visit.rs +++ b/crates/oxc_regular_expression/src/ast_impl/visit.rs @@ -1,3 +1,6 @@ +// NOTE: For now, this file is implemented by hand for convenience. +// But like `oxc_ast`, this should be generated by `tasks/ast_tools` in the future. + #![allow(unused_variables, clippy::wildcard_imports)] use oxc_span::{GetSpan, Span}; use walk::walk_pattern; diff --git a/crates/oxc_regular_expression/src/body_parser/parser.rs b/crates/oxc_regular_expression/src/body_parser/parser.rs index 6e68b6788ed72..9b687f358b5db 100644 --- a/crates/oxc_regular_expression/src/body_parser/parser.rs +++ b/crates/oxc_regular_expression/src/body_parser/parser.rs @@ -7,7 +7,7 @@ use crate::{ body_parser::{reader::Reader, state::State, unicode, unicode_property}, diagnostics, options::ParserOptions, - span::SpanFactory, + span_factory::SpanFactory, surrogate_pair, }; @@ -255,10 +255,13 @@ impl<'a> PatternParser<'a> { }; if let Some(kind) = kind { - return Ok(Some(ast::Term::BoundaryAssertion(ast::BoundaryAssertion { - span: self.span_factory.create(span_start, self.reader.offset()), - kind, - }))); + return Ok(Some(ast::Term::BoundaryAssertion(Box::new_in( + ast::BoundaryAssertion { + span: self.span_factory.create(span_start, self.reader.offset()), + kind, + }, + self.allocator, + )))); } let kind = if self.reader.eat3('(', '?', '=') { @@ -312,11 +315,14 
@@ impl<'a> PatternParser<'a> { if let Some(cp) = self.reader.peek().filter(|&cp| !unicode::is_syntax_character(cp)) { self.reader.advance(); - return Ok(Some(ast::Term::Character(ast::Character { - span: self.span_factory.create(span_start, self.reader.offset()), - kind: ast::CharacterKind::Symbol, - value: cp, - }))); + return Ok(Some(ast::Term::Character(Box::new_in( + ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Symbol, + value: cp, + }, + self.allocator, + )))); } // . @@ -387,11 +393,14 @@ impl<'a> PatternParser<'a> { // \ [lookahead = c] if self.reader.peek().filter(|&cp| cp == 'c' as u32).is_some() { - return Ok(Some(ast::Term::Character(ast::Character { - span: self.span_factory.create(span_start, self.reader.offset()), - kind: ast::CharacterKind::Symbol, - value: '\\' as u32, - }))); + return Ok(Some(ast::Term::Character(Box::new_in( + ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Symbol, + value: '\\' as u32, + }, + self.allocator, + )))); } return Err(diagnostics::invalid_extended_atom_escape( @@ -434,11 +443,14 @@ impl<'a> PatternParser<'a> { // ExtendedPatternCharacter if let Some(cp) = self.consume_extended_pattern_character() { - return Ok(Some(ast::Term::Character(ast::Character { - span: self.span_factory.create(span_start, self.reader.offset()), - kind: ast::CharacterKind::Symbol, - value: cp, - }))); + return Ok(Some(ast::Term::Character(Box::new_in( + ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Symbol, + value: cp, + }, + self.allocator, + )))); } Ok(None) @@ -467,17 +479,23 @@ impl<'a> PatternParser<'a> { )); } - return Ok(Some(ast::Term::IndexedReference(ast::IndexedReference { - span: self.span_factory.create(span_start, self.reader.offset()), - index, - }))); + return Ok(Some(ast::Term::IndexedReference(Box::new_in( + ast::IndexedReference 
{ + span: self.span_factory.create(span_start, self.reader.offset()), + index, + }, + self.allocator, + )))); } if index <= self.state.num_of_capturing_groups { - return Ok(Some(ast::Term::IndexedReference(ast::IndexedReference { - span: self.span_factory.create(span_start, self.reader.offset()), - index, - }))); + return Ok(Some(ast::Term::IndexedReference(Box::new_in( + ast::IndexedReference { + span: self.span_factory.create(span_start, self.reader.offset()), + index, + }, + self.allocator, + )))); } self.reader.rewind(checkpoint); @@ -485,7 +503,10 @@ impl<'a> PatternParser<'a> { // CharacterClassEscape: \d, \p{...} if let Some(character_class_escape) = self.parse_character_class_escape(span_start) { - return Ok(Some(ast::Term::CharacterClassEscape(character_class_escape))); + return Ok(Some(ast::Term::CharacterClassEscape(Box::new_in( + character_class_escape, + self.allocator, + )))); } if let Some(unicode_property_escape) = self.parse_character_class_escape_unicode(span_start)? @@ -498,7 +519,7 @@ impl<'a> PatternParser<'a> { // CharacterEscape: \n, \cM, \0, etc... if let Some(character_escape) = self.parse_character_escape(span_start)? { - return Ok(Some(ast::Term::Character(character_escape))); + return Ok(Some(ast::Term::Character(Box::new_in(character_escape, self.allocator)))); } // k GroupName: \k means named reference @@ -820,11 +841,14 @@ impl<'a> PatternParser<'a> { continue; } - let dash = ast::CharacterClassContents::Character(ast::Character { - span: self.span_factory.create(span_start, self.reader.offset()), - kind: ast::CharacterKind::Symbol, - value: '-' as u32, - }); + let dash = ast::CharacterClassContents::Character(Box::new_in( + ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Symbol, + value: '-' as u32, + }, + self.allocator, + )); let Some(class_atom_to) = self.parse_class_atom()? 
else { // ClassAtom[?UnicodeMode] NonemptyClassRangesNoDash[?UnicodeMode] @@ -855,8 +879,8 @@ impl<'a> PatternParser<'a> { body.push(ast::CharacterClassContents::CharacterClassRange(Box::new_in( ast::CharacterClassRange { span: from.span.merge(&to.span), - min: *from, - max: *to, + min: **from, + max: **to, }, self.allocator, ))); @@ -895,11 +919,14 @@ impl<'a> PatternParser<'a> { let span_start = self.reader.offset(); if self.reader.eat('-') { - return Ok(Some(ast::CharacterClassContents::Character(ast::Character { - span: self.span_factory.create(span_start, self.reader.offset()), - kind: ast::CharacterKind::Symbol, - value: '-' as u32, - }))); + return Ok(Some(ast::CharacterClassContents::Character(Box::new_in( + ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Symbol, + value: '-' as u32, + }, + self.allocator, + )))); } self.parse_class_atom_no_dash() @@ -922,20 +949,26 @@ impl<'a> PatternParser<'a> { { self.reader.advance(); - return Ok(Some(ast::CharacterClassContents::Character(ast::Character { - span: self.span_factory.create(span_start, self.reader.offset()), - kind: ast::CharacterKind::Symbol, - value: cp, - }))); + return Ok(Some(ast::CharacterClassContents::Character(Box::new_in( + ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Symbol, + value: cp, + }, + self.allocator, + )))); } if self.reader.eat('\\') { if self.reader.peek().filter(|&cp| cp == 'c' as u32).is_some() { - return Ok(Some(ast::CharacterClassContents::Character(ast::Character { - span: self.span_factory.create(span_start, self.reader.offset()), - kind: ast::CharacterKind::Symbol, - value: '\\' as u32, - }))); + return Ok(Some(ast::CharacterClassContents::Character(Box::new_in( + ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::Symbol, + value: '\\' as u32, + }, + self.allocator, + )))); } if let 
Some(class_escape) = self.parse_class_escape(span_start)? { @@ -969,20 +1002,26 @@ impl<'a> PatternParser<'a> { ) -> Result<Option<ast::CharacterClassContents<'a>>> { // b if self.reader.eat('b') { - return Ok(Some(ast::CharacterClassContents::Character(ast::Character { - span: self.span_factory.create(span_start, self.reader.offset()), - kind: ast::CharacterKind::SingleEscape, - value: 0x08, - }))); + return Ok(Some(ast::CharacterClassContents::Character(Box::new_in( + ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::SingleEscape, + value: 0x08, + }, + self.allocator, + )))); } // [+UnicodeMode] - if self.state.unicode_mode && self.reader.eat('-') { - return Ok(Some(ast::CharacterClassContents::Character(ast::Character { - span: self.span_factory.create(span_start, self.reader.offset()), - kind: ast::CharacterKind::SingleEscape, - value: '-' as u32, - }))); + return Ok(Some(ast::CharacterClassContents::Character(Box::new_in( + ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::SingleEscape, + value: '-' as u32, + }, + self.allocator, + )))); } // [~UnicodeMode] c ClassControlLetter @@ -997,11 +1036,14 @@ impl<'a> PatternParser<'a> { { self.reader.advance(); - return Ok(Some(ast::CharacterClassContents::Character(ast::Character { - span: self.span_factory.create(span_start, self.reader.offset()), - kind: ast::CharacterKind::ControlLetter, - value: cp, - }))); + return Ok(Some(ast::CharacterClassContents::Character(Box::new_in( + ast::Character { + span: self.span_factory.create(span_start, self.reader.offset()), + kind: ast::CharacterKind::ControlLetter, + value: cp, + }, + self.allocator, + )))); } self.reader.rewind(checkpoint); @@ -1010,9 +1052,10 @@ impl<'a> PatternParser<'a> { // CharacterClassEscape[?UnicodeMode] if let Some(character_class_escape) = self.parse_character_class_escape(span_start) { - return Ok(Some(ast::CharacterClassContents::CharacterClassEscape( + return 
Ok(Some(ast::CharacterClassContents::CharacterClassEscape(Box::new_in( character_class_escape, - ))); + self.allocator, + )))); } if let Some(unicode_property_escape) = self.parse_character_class_escape_unicode(span_start)? @@ -1025,7 +1068,10 @@ impl<'a> PatternParser<'a> { // CharacterEscape[?UnicodeMode, ?NamedCaptureGroups] if let Some(character_escape) = self.parse_character_escape(span_start)? { - return Ok(Some(ast::CharacterClassContents::Character(character_escape))); + return Ok(Some(ast::CharacterClassContents::Character(Box::new_in( + character_escape, + self.allocator, + )))); } Ok(None) @@ -1246,7 +1292,10 @@ impl<'a> PatternParser<'a> { } if let Some(class_set_character) = self.parse_class_set_character()? { - return Ok(Some(ast::CharacterClassContents::Character(class_set_character))); + return Ok(Some(ast::CharacterClassContents::Character(Box::new_in( + class_set_character, + self.allocator, + )))); } Ok(None) @@ -1301,9 +1350,10 @@ impl<'a> PatternParser<'a> { let checkpoint = self.reader.checkpoint(); if self.reader.eat('\\') { if let Some(character_class_escape) = self.parse_character_class_escape(span_start) { - return Ok(Some(ast::CharacterClassContents::CharacterClassEscape( + return Ok(Some(ast::CharacterClassContents::CharacterClassEscape(Box::new_in( character_class_escape, - ))); + self.allocator, + )))); } if let Some(unicode_property_escape) = self.parse_character_class_escape_unicode(span_start)? diff --git a/crates/oxc_regular_expression/src/body_parser/state.rs b/crates/oxc_regular_expression/src/body_parser/state.rs index 109c5acc3adf5..1ddcd181b8e04 100644 --- a/crates/oxc_regular_expression/src/body_parser/state.rs +++ b/crates/oxc_regular_expression/src/body_parser/state.rs @@ -1,6 +1,6 @@ use rustc_hash::FxHashSet; -use super::reader::Reader; +use crate::body_parser::reader::Reader; /// Currently all of properties are read only from outside of this module. /// Even inside of this module, it is not changed after initialized. 
diff --git a/crates/oxc_regular_expression/src/flag_parser.rs b/crates/oxc_regular_expression/src/flags_parser.rs similarity index 96% rename from crates/oxc_regular_expression/src/flag_parser.rs rename to crates/oxc_regular_expression/src/flags_parser.rs index 1c4b059ed32fc..6d6e7cd09b9bf 100644 --- a/crates/oxc_regular_expression/src/flag_parser.rs +++ b/crates/oxc_regular_expression/src/flags_parser.rs @@ -2,7 +2,7 @@ use oxc_allocator::Allocator; use oxc_diagnostics::Result; use rustc_hash::FxHashSet; -use crate::{ast, diagnostics, options::ParserOptions, span::SpanFactory}; +use crate::{ast, diagnostics, options::ParserOptions, span_factory::SpanFactory}; pub struct FlagsParser<'a> { source_text: &'a str, diff --git a/crates/oxc_regular_expression/src/lib.rs b/crates/oxc_regular_expression/src/lib.rs index ee697a9f02b76..4bfce9caf4d49 100644 --- a/crates/oxc_regular_expression/src/lib.rs +++ b/crates/oxc_regular_expression/src/lib.rs @@ -1,15 +1,13 @@ #![allow(clippy::missing_errors_doc)] -pub mod ast; +mod ast_impl; mod body_parser; mod diagnostics; -mod display; -mod flag_parser; +mod flags_parser; mod literal_parser; mod options; -mod span; +mod span_factory; mod surrogate_pair; -pub mod visit; mod generated { mod derive_clone_in; @@ -17,7 +15,8 @@ mod generated { mod derive_content_hash; } +pub mod ast; pub use crate::{ - body_parser::PatternParser, flag_parser::FlagsParser, literal_parser::Parser, + ast_impl::visit, body_parser::PatternParser, flags_parser::FlagsParser, literal_parser::Parser, options::ParserOptions, }; diff --git a/crates/oxc_regular_expression/src/literal_parser.rs b/crates/oxc_regular_expression/src/literal_parser.rs index 56a9e0eb1991b..56b7a2b10111d 100644 --- a/crates/oxc_regular_expression/src/literal_parser.rs +++ b/crates/oxc_regular_expression/src/literal_parser.rs @@ -2,8 +2,8 @@ use oxc_allocator::Allocator; use oxc_diagnostics::Result; use crate::{ - ast, body_parser::PatternParser, diagnostics, flag_parser::FlagsParser, 
options::ParserOptions, - span::SpanFactory, + ast, body_parser::PatternParser, diagnostics, flags_parser::FlagsParser, + options::ParserOptions, span_factory::SpanFactory, }; /// LiteralParser diff --git a/crates/oxc_regular_expression/src/span.rs b/crates/oxc_regular_expression/src/span_factory.rs similarity index 100% rename from crates/oxc_regular_expression/src/span.rs rename to crates/oxc_regular_expression/src/span_factory.rs