From 8b8b506319faed0a815d60bdcde18f501120add5 Mon Sep 17 00:00:00 2001 From: Intege-rs Date: Fri, 28 Nov 2025 02:12:33 -0500 Subject: [PATCH] expose parser.rs via a hack --- sub/_macros/src/lib.rs | 3 + sub/_macros/src/patterns.rs | 487 +----------------------------------- sub/xpat/Cargo.toml | 3 + sub/xpat/src/hexdump.rs | 6 +- sub/xpat/src/lib.rs | 6 + sub/xpat/src/parser.rs | 477 +++++++++++++++++++++++++++++++++++ 6 files changed, 503 insertions(+), 479 deletions(-) create mode 100644 sub/xpat/src/parser.rs diff --git a/sub/_macros/src/lib.rs b/sub/_macros/src/lib.rs index d712b46..253dbe2 100644 --- a/sub/_macros/src/lib.rs +++ b/sub/_macros/src/lib.rs @@ -1,5 +1,8 @@ #![allow(unused)] +// xpat's parser requires this +extern crate alloc; + mod from_repr; mod patterns; diff --git a/sub/_macros/src/patterns.rs b/sub/_macros/src/patterns.rs index 4ede814..59bbbad 100644 --- a/sub/_macros/src/patterns.rs +++ b/sub/_macros/src/patterns.rs @@ -1,6 +1,15 @@ use core::{cmp, fmt, mem, str}; use proc_macro::{Literal, TokenStream, TokenTree}; + +mod atoms { + include!("../../xpat/src/atoms.rs"); +} + +mod parser { + include!("../../xpat/src/parser.rs"); +} + /// Compile time pattern parser. /// /// ```ignore @@ -14,7 +23,7 @@ pub fn proc_pattern(input: TokenStream) -> TokenStream { _e => panic!("expected a single string literal to parse, got: {_e:?}"), }; - let pattern = match parse(&string) { + let pattern = match parser::parse(&string) { Ok(pattern) => pattern, Err(err) => panic!("invalid pattern syntax: {}", err), }; @@ -52,479 +61,3 @@ fn parse_str_literal(input: &Literal) -> String { } string } - - -/// Special skip value to indicate to use platform pointer size instead. -pub(crate) const PTR_SKIP: u8 = 0; - -/// Pattern parsing error. -#[derive(Copy, Clone, Debug, Eq, PartialEq)] -pub struct ParsePatError { - kind: PatError, - position: usize, -} -impl fmt::Display for ParsePatError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Syntax Error @{}: {}.", self.position, self.kind.to_str()) - } -} - -#[derive(Copy, Clone, Debug, Eq, PartialEq)] -enum PatError { - UnpairedHexDigit, - UnknownChar, - ManyOverflow, - ManyRange, - ManyInvalid, - SaveOverflow, - StackError, - StackInvalid, - UnclosedQuote, - AlignedOperand, - CheckOperand, - ReadOperand, - SubPattern, - SubOverflow, - DoubleNibble -} -impl PatError { - fn to_str(self) -> &'static str { - match self { - PatError::UnpairedHexDigit => "unpaired hex digit", - PatError::UnknownChar => "unknown character", - PatError::ManyOverflow => "many range exceeded", - PatError::ManyRange => "many bounds nonsensical", - PatError::ManyInvalid => "many invalid syntax", - PatError::SaveOverflow => "save store overflow", - PatError::StackError => "stack unbalanced", - PatError::StackInvalid => "stack must follow jump", - PatError::UnclosedQuote => "string missing end quote", - PatError::AlignedOperand => "aligned operand error", - PatError::CheckOperand => "aligned operand error", - PatError::ReadOperand => "read operand error", - PatError::SubPattern => "sub pattern error", - PatError::SubOverflow => "sub pattern too large", - PatError::DoubleNibble => "unpaired nibble wildcard", - } - } -} - -//---------------------------------------------------------------- - -include!("../../xpat/src/atoms.rs"); - -/// Pattern parser. -/// -/// # Remarks -/// -/// Following are examples of the pattern syntax. -/// The syntax takes inspiration from [YARA hexadecimal strings](https://yara.readthedocs.io/en/v3.7.0/writingrules.html#hexadecimal-strings). -/// -/// ```text -/// 55 89 e5 83 ? ec -/// ``` -/// -/// Case insensitive hexadecimal characters match the exact byte pattern and question marks serve as placeholders for unknown bytes. -/// -/// Note that a single question mark matches a whole byte. The syntax to mask part of a byte is not yet available. -/// -/// Spaces (code point 32) are completely optional and carry no semantic meaning, their purpose is to visually group things together. -/// -/// ```text -/// b9 ' 37 13 00 00 -/// ``` -/// -/// Single quotes are used as a bookmarks, to save the current cursor rva in the save array passed to the scanner. -/// -/// It is no longer necessary to do tedious address calculations to read information out of the byte stream after a match was found. -/// This power really comes to life with the capability to follow relative and absolute references. -/// -/// The first entry in the save array is reserved for the rva where the pattern was matched. -/// The rest of the save array is filled in order of appearance of the quotes. Here the rva of the quote can be found in `save[1]`. -/// -/// ```text -/// b8 [16] 50 [13-42] ff -/// ``` -/// -/// Pairs of decimal numbers separated by a hypen in square brackets indicate the lower and upper bound of number of bytes to skip. -/// The scanner is non greedy and considers the first match while skipping as little as possible. -/// -/// A single decimal number in square brackets without hypens is a fixed size jump, equivalent to writing that number of consecutive question marks. -/// -/// ```text -/// 31 c0 74 % ' c3 -/// e8 $ ' 31 c0 c3 -/// 68 * ' 31 c0 c3 -/// ``` -/// -/// These symbols are used to follow; a signed 1 byte relative jump: `%`, a signed 4 byte relative jump: `$` and an absolute pointer: `*`. -/// -/// They are designed to be able to have the scanner follow short jumps, calls and longer jumps, and absolute pointers. -/// -/// Composes really well with bookmarks to find the addresses of referenced functions and other data without tedious address calculations. -/// -/// ```text -/// b8 * "STRING" 00 -/// ``` -/// -/// String literals appear in double quotes and will be matched as UTF-8. -/// -/// Escape sequences are not supported, switch back to matching with hex digits as needed. -/// For UTF-16 support, you are welcome to send a PR. -/// -/// ```text -/// e8 $ { ' } 83 f0 5c c3 -/// ``` -/// -/// Curly braces must follow a jump symbol (see above). -/// -/// The sub pattern enclosed within the curly braces is matched at the destination after following the jump. -/// After the pattern successfully matched, the cursor returns to before the jump was followed. -/// The bytes defining the jump are skipped and matching continues again from here. -/// -/// ```text -/// e8 $ @4 -/// ``` -/// -/// Checks that the cursor is aligned at this point in the scan. -/// The align value is `(1 << arg)`, in this example the cursor is checked to be aligned to 16. -/// -/// ```text -/// e8 i1 a0 u4 -/// ``` -/// -/// An `i` or `u` indicates memory read operations followed by the size of the operand to read. -/// -/// The read values are stored in the save array alongside the bookmarked addresses (single quotes). -/// This means the values are sign- or zero- extended respectively before being stored. -/// Operand sizes are 1 (byte), 2 (word) or 4 (dword). -/// -/// The cursor is advanced by the size of the operand. -/// -/// ```text -/// 83 c0 2a ( 6a ? | 68 ? ? ? ? ) e8 -/// ``` -/// -/// Parentheses indicate alternate subpatterns separated by a pipe character. -/// -/// The scanner attempts to match the alternate subpatterns from left to right and fails if none of them match. -pub fn parse(pat: &str) -> Result, ParsePatError> { - let mut result = Vec::with_capacity(pat.len() / 2); - let mut pat_end = pat; - match parse_helper(&mut pat_end, &mut result) { - Ok(()) => Ok(result), - Err(kind) => { - let position = pat_end.as_ptr() as usize - pat.as_ptr() as usize; - Err(ParsePatError { kind, position }) - }, - } -} -// This is preferable but currently limited by macro rules... -// pub use crate::pattern as parse; -fn parse_helper(pat: &mut &str, result: &mut Vec) -> Result<(), PatError> { - result.push(Atom::Save(0)); - let mut iter = pat.as_bytes().iter(); - let mut save = 1; - let mut depth = 0; - #[derive(Default)] - struct SubPattern { - case: usize, - brks: Vec, - save: u8, - save_next: u8, - depth: u8, - } - let mut subs = Vec::::new(); - while let Some(mut chr) = iter.next().cloned() { - match chr { - // Follow signed 1 byte jump - b'%' => result.push(Atom::Jump1), - // Follow signed 4 byte jump - b'$' => result.push(Atom::Jump4), - // Follow pointer - b'*' => result.push(Atom::Ptr), - // Start recursive operator - b'{' => { - depth += 1; - // Must follow a jump operator and insert push before the jump - let atom = match result.last_mut() { - Some(atom @ Atom::Jump1) => mem::replace(atom, Atom::Push(1)), - Some(atom @ Atom::Jump4) => mem::replace(atom, Atom::Push(4)), - Some(atom @ Atom::Ptr) => mem::replace(atom, Atom::Push(PTR_SKIP)), - _ => return Err(PatError::StackInvalid), - }; - result.push(atom); - }, - // End recursive operator - b'}' => { - // Unbalanced recursion - if depth <= 0 { - return Err(PatError::StackError); - } - depth -= 1; - result.push(Atom::Pop); - }, - // Start subpattern - b'(' => { - subs.push(SubPattern::default()); - let sub = subs.last_mut().unwrap(); - // Keep the save and depth state - sub.save = save; - sub.depth = depth; - // Add a new case, update the case offset later - sub.case = result.len(); - result.push(Atom::Case(0)); - }, - // Case subpattern - b'|' => { - // Should already have started a subpattern - let sub = subs.last_mut().ok_or(PatError::SubPattern)?; - // Update the save state - sub.save_next = cmp::max(sub.save_next, save); - save = sub.save; - depth = sub.depth; - // Add a break of the previous subpattern - sub.brks.push(result.len()); - result.push(Atom::Break(0)); - // Add a new case of the next subpattern - let case_offset = result.len() - sub.case - 1; - if case_offset >= 256 { - return Err(PatError::SubOverflow); - } - result[sub.case] = Atom::Case(case_offset as u8); - sub.case = result.len(); - result.push(Atom::Case(0)); - }, - // End subpattern - b')' => { - // Should already have started a subpattern - let sub = subs.pop().ok_or(PatError::SubPattern)?; - // Prepare for the next save - save = cmp::max(sub.save_next, save); - depth = sub.depth; - // Neutralize the last case, since there are no more - result[sub.case] = Atom::Nop; - // Fill in the breaks - for &brk in &sub.brks { - let brk_offset = result.len() - brk - 1; - if brk_offset >= 256 { - return Err(PatError::SubOverflow); - } - result[brk] = Atom::Break(brk_offset as u8); - } - }, - // Skip many operator - b'[' => { - // Parse the lower bound - let mut lower_bound = 0u32; - let mut at_least_one_char = false; - loop { - chr = iter.next().cloned().ok_or(PatError::ManyInvalid)?; - match chr { - b'-' | b']' => break, - chr @ b'0'..=b'9' => { - at_least_one_char = true; - lower_bound = lower_bound * 10 + (chr - b'0') as u32; - if lower_bound >= 16384 { - return Err(PatError::ManyOverflow); - } - }, - _ => return Err(PatError::ManyInvalid), - } - } - if !at_least_one_char { - return Err(PatError::ManyInvalid); - } - // Turn the lower bound into skip ops - if lower_bound > 0 { - if lower_bound >= 256 { - result.push(Atom::Rangext((lower_bound >> 8) as u8)); - } - result.push(Atom::Skip((lower_bound & 0xff) as u8)); - } - // Second many part is optional - if chr == b']' { - continue; - } - // Parse the upper bound - let mut upper_bound = 0u32; - loop { - chr = iter.next().cloned().ok_or(PatError::ManyInvalid)?; - match chr { - b']' => break, - chr @ b'0'..=b'9' => { - upper_bound = upper_bound * 10 + (chr - b'0') as u32; - if upper_bound >= 16384 { - return Err(PatError::ManyOverflow); - } - }, - _ => return Err(PatError::ManyInvalid), - } - } - // Lower bound should be strictly less than the upper bound - if lower_bound < upper_bound { - let many_skip = upper_bound - lower_bound; - if many_skip >= 256 { - result.push(Atom::Rangext((many_skip >> 8) as u8)); - } - result.push(Atom::Many((many_skip & 0xff) as u8)); - } - else { - return Err(PatError::ManyRange); - } - }, - // Match a byte - b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' | b'.' => { - let mut mask = 0xFF; - - // High nibble of the byte - let hi = if chr == b'.' { mask &= 0x0F;0 } - else if chr >= b'a' { chr - b'a' + 10 } - else if chr >= b'A' { chr - b'A' + 10 } - else { chr - b'0' }; - - chr = iter.next().cloned().ok_or(PatError::UnpairedHexDigit)?; - // Low nibble of the byte - let lo = if chr >= b'a' && chr <= b'f' { chr - b'a' + 10 } - else if chr >= b'A' && chr <= b'F' { chr - b'A' + 10 } - else if chr >= b'0' && chr <= b'9' { chr - b'0' } - else if chr == b'.' { mask &= 0xF0; 0 } - else { return Err(PatError::UnpairedHexDigit); }; - - if mask == 0 { return Err(PatError::DoubleNibble); }; - - // mask out nibble - if mask != 0xFF { result.push(Atom::Fuzzy(mask)) } - - // Add byte to the pattern - result.push(Atom::Byte((hi << 4) + lo)); - }, - // Match raw bytes - b'"' => { - loop { - if let Some(chr) = iter.next().cloned() { - if chr != b'"' { - result.push(Atom::Byte(chr)); - } - else { - break; - } - } - else { - return Err(PatError::UnclosedQuote); - } - } - }, - // Save the cursor - b'\'' => { - // 'Limited' save space - if save >= u8::MAX { - return Err(PatError::SaveOverflow); - } - result.push(Atom::Save(save)); - save += 1; - }, - // Skip bytes - b'?' => { - // match result.last_mut() { - // Some(Atom::Skip(skip)) if *skip != PTR_SKIP && *skip < 127i8 => *skip += 1, - // _ => result.push(Atom::Skip(1)), - // }; - // Coalescence skips together - if let Some(Atom::Skip(skip)) = result.last_mut() { - if *skip != PTR_SKIP && *skip < 255u8 { - *skip += 1; - continue; - } - } - result.push(Atom::Skip(1)); - }, - - b'=' => { - let op = iter.next().cloned().ok_or(PatError::CheckOperand)?; - result.push( match op { - b'0'..=b'9' => Atom::Check(op - b'0'), - b'A'..=b'Z' => Atom::Check(10 + (op - b'A')), - b'a'..=b'z' => Atom::Check(10 + (op - b'a')), - _ => return Err(PatError::CheckOperand) - }); - }, - b'@' => { - let op = iter.next().cloned().ok_or(PatError::AlignedOperand)?; - result.push( match op { - b'0'..=b'9' => Atom::Aligned(op - b'0'), - b'A'..=b'Z' => Atom::Aligned(10 + (op - b'A')), - b'a'..=b'z' => Atom::Aligned(10 + (op - b'a')), - _ => return Err(PatError::AlignedOperand) - }); - }, - b'i' => { - let atom = match iter.next().cloned() { - Some(b'1') => Atom::ReadI8(save), - Some(b'2') => Atom::ReadI16(save), - Some(b'4') => Atom::ReadI32(save), - _ => return Err(PatError::ReadOperand), - }; - if save >= u8::MAX { - return Err(PatError::SaveOverflow); - } - save += 1; - result.push(atom); - }, - b'u' => { - let atom = match iter.next().cloned() { - Some(b'1') => Atom::ReadU8(save), - Some(b'2') => Atom::ReadU16(save), - Some(b'4') => Atom::ReadU32(save), - _ => return Err(PatError::ReadOperand), - }; - if save >= u8::MAX { - return Err(PatError::SaveOverflow); - } - save += 1; - result.push(atom); - }, - b'z' => { - if save >= u8::MAX { - return Err(PatError::SaveOverflow); - } - result.push(Atom::Zero(save)); - save += 1; - }, - - - // Allow spaces as padding - b' ' | b'\n' | b'\r' | b'\t' => {}, - // Everything else is illegal - _ => { - return Err(PatError::UnknownChar); - }, - } - // Converted from str originally, should be safe - *pat = unsafe { str::from_utf8_unchecked(iter.as_slice()) }; - } - // Check balanced stack operators - if depth != 0 { - return Err(PatError::StackError); - } - // Check if sub patterns are balanced - if subs.len() != 0 { - return Err(PatError::SubPattern); - } - - // Remove redundant atoms at the end - fn is_redundant(atom: &Atom) -> bool { - match atom { - | Atom::Skip(_) - | Atom::Rangext(_) - | Atom::Pop - | Atom::Many(_) => true, - _ => false, - } - } - while result.last().map(is_redundant).unwrap_or(false) { - result.pop(); - } - - Ok(()) -} \ No newline at end of file diff --git a/sub/xpat/Cargo.toml b/sub/xpat/Cargo.toml index 31457b4..cbc804e 100644 --- a/sub/xpat/Cargo.toml +++ b/sub/xpat/Cargo.toml @@ -3,6 +3,9 @@ name = "sub_xpat" version = "0.1.0" edition = "2021" +[features] +alloc = [] + [dependencies] sub_core.workspace = true sub_macros.workspace = true \ No newline at end of file diff --git a/sub/xpat/src/hexdump.rs b/sub/xpat/src/hexdump.rs index e37381d..946ec63 100644 --- a/sub/xpat/src/hexdump.rs +++ b/sub/xpat/src/hexdump.rs @@ -6,13 +6,15 @@ const SEP: &str = " | "; pub struct HexDump<'s, T: Scannable + ?Sized, R: RangeBounds>(pub &'s T, pub R); +#[allow(clippy::needless_lifetimes)] pub fn hex< + 'a, T: Scannable + ?Sized, R: RangeBounds >( - data: &T, + data: &'a T, range:R -) -> HexDump { +) -> HexDump<'a, T, R> { HexDump(data, range) } diff --git a/sub/xpat/src/lib.rs b/sub/xpat/src/lib.rs index 03a4ae0..642134d 100644 --- a/sub/xpat/src/lib.rs +++ b/sub/xpat/src/lib.rs @@ -8,6 +8,12 @@ pub mod scannable; pub mod scanner; pub mod hexdump; +#[cfg(feature = "alloc")] +extern crate alloc; + +#[cfg(feature = "alloc")] +pub mod parser; + // // Export Preludes: // diff --git a/sub/xpat/src/parser.rs b/sub/xpat/src/parser.rs new file mode 100644 index 0000000..2706600 --- /dev/null +++ b/sub/xpat/src/parser.rs @@ -0,0 +1,477 @@ +use core::{cmp, fmt, mem, str}; +use super::atoms::Atom; +use alloc::vec::Vec; + +/// Special skip value to indicate to use platform pointer size instead. +pub(crate) const PTR_SKIP: u8 = 0; + +/// Pattern parsing error. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct ParsePatError { + kind: PatError, + position: usize, +} + +impl fmt::Display for ParsePatError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Syntax Error @{}: {}.", self.position, self.kind.to_str()) + } +} + +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +enum PatError { + UnpairedHexDigit, + UnknownChar, + ManyOverflow, + ManyRange, + ManyInvalid, + SaveOverflow, + StackError, + StackInvalid, + UnclosedQuote, + AlignedOperand, + CheckOperand, + ReadOperand, + SubPattern, + SubOverflow, + DoubleNibble +} +impl PatError { + fn to_str(self) -> &'static str { + match self { + PatError::UnpairedHexDigit => "unpaired hex digit", + PatError::UnknownChar => "unknown character", + PatError::ManyOverflow => "many range exceeded", + PatError::ManyRange => "many bounds nonsensical", + PatError::ManyInvalid => "many invalid syntax", + PatError::SaveOverflow => "save store overflow", + PatError::StackError => "stack unbalanced", + PatError::StackInvalid => "stack must follow jump", + PatError::UnclosedQuote => "string missing end quote", + PatError::AlignedOperand => "aligned operand error", + PatError::CheckOperand => "aligned operand error", + PatError::ReadOperand => "read operand error", + PatError::SubPattern => "sub pattern error", + PatError::SubOverflow => "sub pattern too large", + PatError::DoubleNibble => "unpaired nibble wildcard", + } + } +} + +//---------------------------------------------------------------- + +/// Pattern parser. +/// +/// # Remarks +/// +/// Following are examples of the pattern syntax. +/// The syntax takes inspiration from [YARA hexadecimal strings](https://yara.readthedocs.io/en/v3.7.0/writingrules.html#hexadecimal-strings). +/// +/// ```text +/// 55 89 e5 83 ? ec +/// ``` +/// +/// Case insensitive hexadecimal characters match the exact byte pattern and question marks serve as placeholders for unknown bytes. +/// +/// Note that a single question mark matches a whole byte. The syntax to mask part of a byte is not yet available. +/// +/// Spaces (code point 32) are completely optional and carry no semantic meaning, their purpose is to visually group things together. +/// +/// ```text +/// b9 ' 37 13 00 00 +/// ``` +/// +/// Single quotes are used as a bookmarks, to save the current cursor rva in the save array passed to the scanner. +/// +/// It is no longer necessary to do tedious address calculations to read information out of the byte stream after a match was found. +/// This power really comes to life with the capability to follow relative and absolute references. +/// +/// The first entry in the save array is reserved for the rva where the pattern was matched. +/// The rest of the save array is filled in order of appearance of the quotes. Here the rva of the quote can be found in `save[1]`. +/// +/// ```text +/// b8 [16] 50 [13-42] ff +/// ``` +/// +/// Pairs of decimal numbers separated by a hypen in square brackets indicate the lower and upper bound of number of bytes to skip. +/// The scanner is non greedy and considers the first match while skipping as little as possible. +/// +/// A single decimal number in square brackets without hypens is a fixed size jump, equivalent to writing that number of consecutive question marks. +/// +/// ```text +/// 31 c0 74 % ' c3 +/// e8 $ ' 31 c0 c3 +/// 68 * ' 31 c0 c3 +/// ``` +/// +/// These symbols are used to follow; a signed 1 byte relative jump: `%`, a signed 4 byte relative jump: `$` and an absolute pointer: `*`. +/// +/// They are designed to be able to have the scanner follow short jumps, calls and longer jumps, and absolute pointers. +/// +/// Composes really well with bookmarks to find the addresses of referenced functions and other data without tedious address calculations. +/// +/// ```text +/// b8 * "STRING" 00 +/// ``` +/// +/// String literals appear in double quotes and will be matched as UTF-8. +/// +/// Escape sequences are not supported, switch back to matching with hex digits as needed. +/// For UTF-16 support, you are welcome to send a PR. +/// +/// ```text +/// e8 $ { ' } 83 f0 5c c3 +/// ``` +/// +/// Curly braces must follow a jump symbol (see above). +/// +/// The sub pattern enclosed within the curly braces is matched at the destination after following the jump. +/// After the pattern successfully matched, the cursor returns to before the jump was followed. +/// The bytes defining the jump are skipped and matching continues again from here. +/// +/// ```text +/// e8 $ @4 +/// ``` +/// +/// Checks that the cursor is aligned at this point in the scan. +/// The align value is `(1 << arg)`, in this example the cursor is checked to be aligned to 16. +/// +/// ```text +/// e8 i1 a0 u4 +/// ``` +/// +/// An `i` or `u` indicates memory read operations followed by the size of the operand to read. +/// +/// The read values are stored in the save array alongside the bookmarked addresses (single quotes). +/// This means the values are sign- or zero- extended respectively before being stored. +/// Operand sizes are 1 (byte), 2 (word) or 4 (dword). +/// +/// The cursor is advanced by the size of the operand. +/// +/// ```text +/// 83 c0 2a ( 6a ? | 68 ? ? ? ? ) e8 +/// ``` +/// +/// Parentheses indicate alternate subpatterns separated by a pipe character. +/// +/// The scanner attempts to match the alternate subpatterns from left to right and fails if none of them match. +pub fn parse(pat: &str) -> Result, ParsePatError> { + let mut result = Vec::with_capacity(pat.len() / 2); + let mut pat_end = pat; + match parse_helper(&mut pat_end, &mut result) { + Ok(()) => Ok(result), + Err(kind) => { + let position = pat_end.as_ptr() as usize - pat.as_ptr() as usize; + Err(ParsePatError { kind, position }) + }, + } +} +// This is preferable but currently limited by macro rules... +// pub use crate::pattern as parse; +fn parse_helper(pat: &mut &str, result: &mut Vec) -> Result<(), PatError> { + result.push(Atom::Save(0)); + let mut iter = pat.as_bytes().iter(); + let mut save = 1; + let mut depth = 0; + #[derive(Default)] + struct SubPattern { + case: usize, + brks: Vec, + save: u8, + save_next: u8, + depth: u8, + } + let mut subs = Vec::::new(); + while let Some(mut chr) = iter.next().cloned() { + match chr { + // Follow signed 1 byte jump + b'%' => result.push(Atom::Jump1), + // Follow signed 4 byte jump + b'$' => result.push(Atom::Jump4), + // Follow pointer + b'*' => result.push(Atom::Ptr), + // Start recursive operator + b'{' => { + depth += 1; + // Must follow a jump operator and insert push before the jump + let atom = match result.last_mut() { + Some(atom @ Atom::Jump1) => mem::replace(atom, Atom::Push(1)), + Some(atom @ Atom::Jump4) => mem::replace(atom, Atom::Push(4)), + Some(atom @ Atom::Ptr) => mem::replace(atom, Atom::Push(PTR_SKIP)), + _ => return Err(PatError::StackInvalid), + }; + result.push(atom); + }, + // End recursive operator + b'}' => { + // Unbalanced recursion + if depth <= 0 { + return Err(PatError::StackError); + } + depth -= 1; + result.push(Atom::Pop); + }, + // Start subpattern + b'(' => { + subs.push(SubPattern::default()); + let sub = subs.last_mut().unwrap(); + // Keep the save and depth state + sub.save = save; + sub.depth = depth; + // Add a new case, update the case offset later + sub.case = result.len(); + result.push(Atom::Case(0)); + }, + // Case subpattern + b'|' => { + // Should already have started a subpattern + let sub = subs.last_mut().ok_or(PatError::SubPattern)?; + // Update the save state + sub.save_next = cmp::max(sub.save_next, save); + save = sub.save; + depth = sub.depth; + // Add a break of the previous subpattern + sub.brks.push(result.len()); + result.push(Atom::Break(0)); + // Add a new case of the next subpattern + let case_offset = result.len() - sub.case - 1; + if case_offset >= 256 { + return Err(PatError::SubOverflow); + } + result[sub.case] = Atom::Case(case_offset as u8); + sub.case = result.len(); + result.push(Atom::Case(0)); + }, + // End subpattern + b')' => { + // Should already have started a subpattern + let sub = subs.pop().ok_or(PatError::SubPattern)?; + // Prepare for the next save + save = cmp::max(sub.save_next, save); + depth = sub.depth; + // Neutralize the last case, since there are no more + result[sub.case] = Atom::Nop; + // Fill in the breaks + for &brk in &sub.brks { + let brk_offset = result.len() - brk - 1; + if brk_offset >= 256 { + return Err(PatError::SubOverflow); + } + result[brk] = Atom::Break(brk_offset as u8); + } + }, + // Skip many operator + b'[' => { + // Parse the lower bound + let mut lower_bound = 0u32; + let mut at_least_one_char = false; + loop { + chr = iter.next().cloned().ok_or(PatError::ManyInvalid)?; + match chr { + b'-' | b']' => break, + chr @ b'0'..=b'9' => { + at_least_one_char = true; + lower_bound = lower_bound * 10 + (chr - b'0') as u32; + if lower_bound >= 16384 { + return Err(PatError::ManyOverflow); + } + }, + _ => return Err(PatError::ManyInvalid), + } + } + if !at_least_one_char { + return Err(PatError::ManyInvalid); + } + // Turn the lower bound into skip ops + if lower_bound > 0 { + if lower_bound >= 256 { + result.push(Atom::Rangext((lower_bound >> 8) as u8)); + } + result.push(Atom::Skip((lower_bound & 0xff) as u8)); + } + // Second many part is optional + if chr == b']' { + continue; + } + // Parse the upper bound + let mut upper_bound = 0u32; + loop { + chr = iter.next().cloned().ok_or(PatError::ManyInvalid)?; + match chr { + b']' => break, + chr @ b'0'..=b'9' => { + upper_bound = upper_bound * 10 + (chr - b'0') as u32; + if upper_bound >= 16384 { + return Err(PatError::ManyOverflow); + } + }, + _ => return Err(PatError::ManyInvalid), + } + } + // Lower bound should be strictly less than the upper bound + if lower_bound < upper_bound { + let many_skip = upper_bound - lower_bound; + if many_skip >= 256 { + result.push(Atom::Rangext((many_skip >> 8) as u8)); + } + result.push(Atom::Many((many_skip & 0xff) as u8)); + } + else { + return Err(PatError::ManyRange); + } + }, + // Match a byte + b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' | b'.' => { + let mut mask = 0xFF; + + // High nibble of the byte + let hi = if chr == b'.' { mask &= 0x0F;0 } + else if chr >= b'a' { chr - b'a' + 10 } + else if chr >= b'A' { chr - b'A' + 10 } + else { chr - b'0' }; + + chr = iter.next().cloned().ok_or(PatError::UnpairedHexDigit)?; + // Low nibble of the byte + let lo = if chr >= b'a' && chr <= b'f' { chr - b'a' + 10 } + else if chr >= b'A' && chr <= b'F' { chr - b'A' + 10 } + else if chr >= b'0' && chr <= b'9' { chr - b'0' } + else if chr == b'.' { mask &= 0xF0; 0 } + else { return Err(PatError::UnpairedHexDigit); }; + + if mask == 0 { return Err(PatError::DoubleNibble); }; + + // mask out nibble + if mask != 0xFF { result.push(Atom::Fuzzy(mask)) } + + // Add byte to the pattern + result.push(Atom::Byte((hi << 4) + lo)); + }, + // Match raw bytes + b'"' => { + loop { + if let Some(chr) = iter.next().cloned() { + if chr != b'"' { + result.push(Atom::Byte(chr)); + } + else { + break; + } + } + else { + return Err(PatError::UnclosedQuote); + } + } + }, + // Save the cursor + b'\'' => { + // 'Limited' save space + if save >= u8::MAX { + return Err(PatError::SaveOverflow); + } + result.push(Atom::Save(save)); + save += 1; + }, + // Skip bytes + b'?' => { + // match result.last_mut() { + // Some(Atom::Skip(skip)) if *skip != PTR_SKIP && *skip < 127i8 => *skip += 1, + // _ => result.push(Atom::Skip(1)), + // }; + // Coalescence skips together + if let Some(Atom::Skip(skip)) = result.last_mut() { + if *skip != PTR_SKIP && *skip < 255u8 { + *skip += 1; + continue; + } + } + result.push(Atom::Skip(1)); + }, + + b'=' => { + let op = iter.next().cloned().ok_or(PatError::CheckOperand)?; + result.push( match op { + b'0'..=b'9' => Atom::Check(op - b'0'), + b'A'..=b'Z' => Atom::Check(10 + (op - b'A')), + b'a'..=b'z' => Atom::Check(10 + (op - b'a')), + _ => return Err(PatError::CheckOperand) + }); + }, + b'@' => { + let op = iter.next().cloned().ok_or(PatError::AlignedOperand)?; + result.push( match op { + b'0'..=b'9' => Atom::Aligned(op - b'0'), + b'A'..=b'Z' => Atom::Aligned(10 + (op - b'A')), + b'a'..=b'z' => Atom::Aligned(10 + (op - b'a')), + _ => return Err(PatError::AlignedOperand) + }); + }, + b'i' => { + let atom = match iter.next().cloned() { + Some(b'1') => Atom::ReadI8(save), + Some(b'2') => Atom::ReadI16(save), + Some(b'4') => Atom::ReadI32(save), + _ => return Err(PatError::ReadOperand), + }; + if save >= u8::MAX { + return Err(PatError::SaveOverflow); + } + save += 1; + result.push(atom); + }, + b'u' => { + let atom = match iter.next().cloned() { + Some(b'1') => Atom::ReadU8(save), + Some(b'2') => Atom::ReadU16(save), + Some(b'4') => Atom::ReadU32(save), + _ => return Err(PatError::ReadOperand), + }; + if save >= u8::MAX { + return Err(PatError::SaveOverflow); + } + save += 1; + result.push(atom); + }, + b'z' => { + if save >= u8::MAX { + return Err(PatError::SaveOverflow); + } + result.push(Atom::Zero(save)); + save += 1; + }, + + + // Allow spaces as padding + b' ' | b'\n' | b'\r' | b'\t' => {}, + // Everything else is illegal + _ => { + return Err(PatError::UnknownChar); + }, + } + // Converted from str originally, should be safe + *pat = unsafe { str::from_utf8_unchecked(iter.as_slice()) }; + } + // Check balanced stack operators + if depth != 0 { + return Err(PatError::StackError); + } + // Check if sub patterns are balanced + if subs.len() != 0 { + return Err(PatError::SubPattern); + } + + // Remove redundant atoms at the end + fn is_redundant(atom: &Atom) -> bool { + match atom { + | Atom::Skip(_) + | Atom::Rangext(_) + | Atom::Pop + | Atom::Many(_) => true, + _ => false, + } + } + while result.last().map(is_redundant).unwrap_or(false) { + result.pop(); + } + + Ok(()) +} \ No newline at end of file