expose parser.rs via a hack
This commit is contained in:
@@ -1,6 +1,15 @@
|
||||
use core::{cmp, fmt, mem, str};
|
||||
use proc_macro::{Literal, TokenStream, TokenTree};
|
||||
|
||||
|
||||
mod atoms {
|
||||
include!("../../xpat/src/atoms.rs");
|
||||
}
|
||||
|
||||
mod parser {
|
||||
include!("../../xpat/src/parser.rs");
|
||||
}
|
||||
|
||||
/// Compile time pattern parser.
|
||||
///
|
||||
/// ```ignore
|
||||
@@ -14,7 +23,7 @@ pub fn proc_pattern(input: TokenStream) -> TokenStream {
|
||||
_e => panic!("expected a single string literal to parse, got: {_e:?}"),
|
||||
};
|
||||
|
||||
let pattern = match parse(&string) {
|
||||
let pattern = match parser::parse(&string) {
|
||||
Ok(pattern) => pattern,
|
||||
Err(err) => panic!("invalid pattern syntax: {}", err),
|
||||
};
|
||||
@@ -52,479 +61,3 @@ fn parse_str_literal(input: &Literal) -> String {
|
||||
}
|
||||
string
|
||||
}
|
||||
|
||||
|
||||
/// Special skip value indicating that the platform pointer size should be used instead.
|
||||
pub(crate) const PTR_SKIP: u8 = 0;
|
||||
|
||||
/// Pattern parsing error.
|
||||
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
|
||||
pub struct ParsePatError {
|
||||
kind: PatError,
|
||||
position: usize,
|
||||
}
|
||||
impl fmt::Display for ParsePatError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "Syntax Error @{}: {}.", self.position, self.kind.to_str())
|
||||
}
|
||||
}
|
||||
|
||||
/// The individual kinds of syntax error the pattern parser can report.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum PatError {
    /// A hex digit (or nibble wildcard) was not followed by a valid second nibble.
    UnpairedHexDigit,
    /// A character that is not part of the pattern syntax was encountered.
    UnknownChar,
    /// A bound inside `[..]` is too large.
    ManyOverflow,
    /// The lower bound inside `[..]` is not strictly less than the upper bound.
    ManyRange,
    /// The contents of `[..]` are malformed.
    ManyInvalid,
    /// No more save slots are available.
    SaveOverflow,
    /// `{` and `}` are not balanced.
    StackError,
    /// `{` does not directly follow a jump or pointer operator.
    StackInvalid,
    /// A `"` string literal was never closed.
    UnclosedQuote,
    /// The operand of `@` is missing or invalid.
    AlignedOperand,
    /// The operand of `=` is missing or invalid.
    CheckOperand,
    /// The operand size following `i` or `u` is missing or invalid.
    ReadOperand,
    /// `(`, `|` and `)` are not used in a balanced way.
    SubPattern,
    /// An alternation is too long for its offsets to be encoded.
    SubOverflow,
    /// Both nibbles of a byte were given as wildcards.
    DoubleNibble,
}

impl PatError {
    /// Returns the short, human readable description of this error kind.
    fn to_str(self) -> &'static str {
        match self {
            PatError::UnpairedHexDigit => "unpaired hex digit",
            PatError::UnknownChar => "unknown character",
            PatError::ManyOverflow => "many range exceeded",
            PatError::ManyRange => "many bounds nonsensical",
            PatError::ManyInvalid => "many invalid syntax",
            PatError::SaveOverflow => "save store overflow",
            PatError::StackError => "stack unbalanced",
            PatError::StackInvalid => "stack must follow jump",
            PatError::UnclosedQuote => "string missing end quote",
            PatError::AlignedOperand => "aligned operand error",
            // Previously reported the `AlignedOperand` message by mistake.
            PatError::CheckOperand => "check operand error",
            PatError::ReadOperand => "read operand error",
            PatError::SubPattern => "sub pattern error",
            PatError::SubOverflow => "sub pattern too large",
            PatError::DoubleNibble => "unpaired nibble wildcard",
        }
    }
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
|
||||
include!("../../xpat/src/atoms.rs");
|
||||
|
||||
/// Pattern parser.
|
||||
///
|
||||
/// # Remarks
|
||||
///
|
||||
/// Following are examples of the pattern syntax.
|
||||
/// The syntax takes inspiration from [YARA hexadecimal strings](https://yara.readthedocs.io/en/v3.7.0/writingrules.html#hexadecimal-strings).
|
||||
///
|
||||
/// ```text
|
||||
/// 55 89 e5 83 ? ec
|
||||
/// ```
|
||||
///
|
||||
/// Case insensitive hexadecimal characters match the exact byte pattern and question marks serve as placeholders for unknown bytes.
|
||||
///
|
||||
/// Note that a single question mark matches a whole byte. The syntax to mask part of a byte is not yet available.
|
||||
///
|
||||
/// Spaces (code point 32) are completely optional and carry no semantic meaning, their purpose is to visually group things together.
|
||||
///
|
||||
/// ```text
|
||||
/// b9 ' 37 13 00 00
|
||||
/// ```
|
||||
///
|
||||
/// Single quotes are used as a bookmarks, to save the current cursor rva in the save array passed to the scanner.
|
||||
///
|
||||
/// It is no longer necessary to do tedious address calculations to read information out of the byte stream after a match was found.
|
||||
/// This power really comes to life with the capability to follow relative and absolute references.
|
||||
///
|
||||
/// The first entry in the save array is reserved for the rva where the pattern was matched.
|
||||
/// The rest of the save array is filled in order of appearance of the quotes. Here the rva of the quote can be found in `save[1]`.
|
||||
///
|
||||
/// ```text
|
||||
/// b8 [16] 50 [13-42] ff
|
||||
/// ```
|
||||
///
|
||||
/// Pairs of decimal numbers separated by a hypen in square brackets indicate the lower and upper bound of number of bytes to skip.
|
||||
/// The scanner is non greedy and considers the first match while skipping as little as possible.
|
||||
///
|
||||
/// A single decimal number in square brackets without hypens is a fixed size jump, equivalent to writing that number of consecutive question marks.
|
||||
///
|
||||
/// ```text
|
||||
/// 31 c0 74 % ' c3
|
||||
/// e8 $ ' 31 c0 c3
|
||||
/// 68 * ' 31 c0 c3
|
||||
/// ```
|
||||
///
|
||||
/// These symbols are used to follow; a signed 1 byte relative jump: `%`, a signed 4 byte relative jump: `$` and an absolute pointer: `*`.
|
||||
///
|
||||
/// They are designed to be able to have the scanner follow short jumps, calls and longer jumps, and absolute pointers.
|
||||
///
|
||||
/// Composes really well with bookmarks to find the addresses of referenced functions and other data without tedious address calculations.
|
||||
///
|
||||
/// ```text
|
||||
/// b8 * "STRING" 00
|
||||
/// ```
|
||||
///
|
||||
/// String literals appear in double quotes and will be matched as UTF-8.
|
||||
///
|
||||
/// Escape sequences are not supported, switch back to matching with hex digits as needed.
|
||||
/// For UTF-16 support, you are welcome to send a PR.
|
||||
///
|
||||
/// ```text
|
||||
/// e8 $ { ' } 83 f0 5c c3
|
||||
/// ```
|
||||
///
|
||||
/// Curly braces must follow a jump symbol (see above).
|
||||
///
|
||||
/// The sub pattern enclosed within the curly braces is matched at the destination after following the jump.
|
||||
/// After the pattern successfully matched, the cursor returns to before the jump was followed.
|
||||
/// The bytes defining the jump are skipped and matching continues again from here.
|
||||
///
|
||||
/// ```text
|
||||
/// e8 $ @4
|
||||
/// ```
|
||||
///
|
||||
/// Checks that the cursor is aligned at this point in the scan.
|
||||
/// The align value is `(1 << arg)`, in this example the cursor is checked to be aligned to 16.
|
||||
///
|
||||
/// ```text
|
||||
/// e8 i1 a0 u4
|
||||
/// ```
|
||||
///
|
||||
/// An `i` or `u` indicates memory read operations followed by the size of the operand to read.
|
||||
///
|
||||
/// The read values are stored in the save array alongside the bookmarked addresses (single quotes).
|
||||
/// This means the values are sign- or zero- extended respectively before being stored.
|
||||
/// Operand sizes are 1 (byte), 2 (word) or 4 (dword).
|
||||
///
|
||||
/// The cursor is advanced by the size of the operand.
|
||||
///
|
||||
/// ```text
|
||||
/// 83 c0 2a ( 6a ? | 68 ? ? ? ? ) e8
|
||||
/// ```
|
||||
///
|
||||
/// Parentheses indicate alternate subpatterns separated by a pipe character.
|
||||
///
|
||||
/// The scanner attempts to match the alternate subpatterns from left to right and fails if none of them match.
|
||||
pub fn parse(pat: &str) -> Result<Vec<Atom>, ParsePatError> {
|
||||
let mut result = Vec::with_capacity(pat.len() / 2);
|
||||
let mut pat_end = pat;
|
||||
match parse_helper(&mut pat_end, &mut result) {
|
||||
Ok(()) => Ok(result),
|
||||
Err(kind) => {
|
||||
let position = pat_end.as_ptr() as usize - pat.as_ptr() as usize;
|
||||
Err(ParsePatError { kind, position })
|
||||
},
|
||||
}
|
||||
}
|
||||
// This is preferable but currently limited by macro rules...
|
||||
// pub use crate::pattern as parse;
|
||||
fn parse_helper(pat: &mut &str, result: &mut Vec<Atom>) -> Result<(), PatError> {
|
||||
result.push(Atom::Save(0));
|
||||
let mut iter = pat.as_bytes().iter();
|
||||
let mut save = 1;
|
||||
let mut depth = 0;
|
||||
#[derive(Default)]
|
||||
struct SubPattern {
|
||||
case: usize,
|
||||
brks: Vec<usize>,
|
||||
save: u8,
|
||||
save_next: u8,
|
||||
depth: u8,
|
||||
}
|
||||
let mut subs = Vec::<SubPattern>::new();
|
||||
while let Some(mut chr) = iter.next().cloned() {
|
||||
match chr {
|
||||
// Follow signed 1 byte jump
|
||||
b'%' => result.push(Atom::Jump1),
|
||||
// Follow signed 4 byte jump
|
||||
b'$' => result.push(Atom::Jump4),
|
||||
// Follow pointer
|
||||
b'*' => result.push(Atom::Ptr),
|
||||
// Start recursive operator
|
||||
b'{' => {
|
||||
depth += 1;
|
||||
// Must follow a jump operator and insert push before the jump
|
||||
let atom = match result.last_mut() {
|
||||
Some(atom @ Atom::Jump1) => mem::replace(atom, Atom::Push(1)),
|
||||
Some(atom @ Atom::Jump4) => mem::replace(atom, Atom::Push(4)),
|
||||
Some(atom @ Atom::Ptr) => mem::replace(atom, Atom::Push(PTR_SKIP)),
|
||||
_ => return Err(PatError::StackInvalid),
|
||||
};
|
||||
result.push(atom);
|
||||
},
|
||||
// End recursive operator
|
||||
b'}' => {
|
||||
// Unbalanced recursion
|
||||
if depth <= 0 {
|
||||
return Err(PatError::StackError);
|
||||
}
|
||||
depth -= 1;
|
||||
result.push(Atom::Pop);
|
||||
},
|
||||
// Start subpattern
|
||||
b'(' => {
|
||||
subs.push(SubPattern::default());
|
||||
let sub = subs.last_mut().unwrap();
|
||||
// Keep the save and depth state
|
||||
sub.save = save;
|
||||
sub.depth = depth;
|
||||
// Add a new case, update the case offset later
|
||||
sub.case = result.len();
|
||||
result.push(Atom::Case(0));
|
||||
},
|
||||
// Case subpattern
|
||||
b'|' => {
|
||||
// Should already have started a subpattern
|
||||
let sub = subs.last_mut().ok_or(PatError::SubPattern)?;
|
||||
// Update the save state
|
||||
sub.save_next = cmp::max(sub.save_next, save);
|
||||
save = sub.save;
|
||||
depth = sub.depth;
|
||||
// Add a break of the previous subpattern
|
||||
sub.brks.push(result.len());
|
||||
result.push(Atom::Break(0));
|
||||
// Add a new case of the next subpattern
|
||||
let case_offset = result.len() - sub.case - 1;
|
||||
if case_offset >= 256 {
|
||||
return Err(PatError::SubOverflow);
|
||||
}
|
||||
result[sub.case] = Atom::Case(case_offset as u8);
|
||||
sub.case = result.len();
|
||||
result.push(Atom::Case(0));
|
||||
},
|
||||
// End subpattern
|
||||
b')' => {
|
||||
// Should already have started a subpattern
|
||||
let sub = subs.pop().ok_or(PatError::SubPattern)?;
|
||||
// Prepare for the next save
|
||||
save = cmp::max(sub.save_next, save);
|
||||
depth = sub.depth;
|
||||
// Neutralize the last case, since there are no more
|
||||
result[sub.case] = Atom::Nop;
|
||||
// Fill in the breaks
|
||||
for &brk in &sub.brks {
|
||||
let brk_offset = result.len() - brk - 1;
|
||||
if brk_offset >= 256 {
|
||||
return Err(PatError::SubOverflow);
|
||||
}
|
||||
result[brk] = Atom::Break(brk_offset as u8);
|
||||
}
|
||||
},
|
||||
// Skip many operator
|
||||
b'[' => {
|
||||
// Parse the lower bound
|
||||
let mut lower_bound = 0u32;
|
||||
let mut at_least_one_char = false;
|
||||
loop {
|
||||
chr = iter.next().cloned().ok_or(PatError::ManyInvalid)?;
|
||||
match chr {
|
||||
b'-' | b']' => break,
|
||||
chr @ b'0'..=b'9' => {
|
||||
at_least_one_char = true;
|
||||
lower_bound = lower_bound * 10 + (chr - b'0') as u32;
|
||||
if lower_bound >= 16384 {
|
||||
return Err(PatError::ManyOverflow);
|
||||
}
|
||||
},
|
||||
_ => return Err(PatError::ManyInvalid),
|
||||
}
|
||||
}
|
||||
if !at_least_one_char {
|
||||
return Err(PatError::ManyInvalid);
|
||||
}
|
||||
// Turn the lower bound into skip ops
|
||||
if lower_bound > 0 {
|
||||
if lower_bound >= 256 {
|
||||
result.push(Atom::Rangext((lower_bound >> 8) as u8));
|
||||
}
|
||||
result.push(Atom::Skip((lower_bound & 0xff) as u8));
|
||||
}
|
||||
// Second many part is optional
|
||||
if chr == b']' {
|
||||
continue;
|
||||
}
|
||||
// Parse the upper bound
|
||||
let mut upper_bound = 0u32;
|
||||
loop {
|
||||
chr = iter.next().cloned().ok_or(PatError::ManyInvalid)?;
|
||||
match chr {
|
||||
b']' => break,
|
||||
chr @ b'0'..=b'9' => {
|
||||
upper_bound = upper_bound * 10 + (chr - b'0') as u32;
|
||||
if upper_bound >= 16384 {
|
||||
return Err(PatError::ManyOverflow);
|
||||
}
|
||||
},
|
||||
_ => return Err(PatError::ManyInvalid),
|
||||
}
|
||||
}
|
||||
// Lower bound should be strictly less than the upper bound
|
||||
if lower_bound < upper_bound {
|
||||
let many_skip = upper_bound - lower_bound;
|
||||
if many_skip >= 256 {
|
||||
result.push(Atom::Rangext((many_skip >> 8) as u8));
|
||||
}
|
||||
result.push(Atom::Many((many_skip & 0xff) as u8));
|
||||
}
|
||||
else {
|
||||
return Err(PatError::ManyRange);
|
||||
}
|
||||
},
|
||||
// Match a byte
|
||||
b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' | b'.' => {
|
||||
let mut mask = 0xFF;
|
||||
|
||||
// High nibble of the byte
|
||||
let hi = if chr == b'.' { mask &= 0x0F;0 }
|
||||
else if chr >= b'a' { chr - b'a' + 10 }
|
||||
else if chr >= b'A' { chr - b'A' + 10 }
|
||||
else { chr - b'0' };
|
||||
|
||||
chr = iter.next().cloned().ok_or(PatError::UnpairedHexDigit)?;
|
||||
// Low nibble of the byte
|
||||
let lo = if chr >= b'a' && chr <= b'f' { chr - b'a' + 10 }
|
||||
else if chr >= b'A' && chr <= b'F' { chr - b'A' + 10 }
|
||||
else if chr >= b'0' && chr <= b'9' { chr - b'0' }
|
||||
else if chr == b'.' { mask &= 0xF0; 0 }
|
||||
else { return Err(PatError::UnpairedHexDigit); };
|
||||
|
||||
if mask == 0 { return Err(PatError::DoubleNibble); };
|
||||
|
||||
// mask out nibble
|
||||
if mask != 0xFF { result.push(Atom::Fuzzy(mask)) }
|
||||
|
||||
// Add byte to the pattern
|
||||
result.push(Atom::Byte((hi << 4) + lo));
|
||||
},
|
||||
// Match raw bytes
|
||||
b'"' => {
|
||||
loop {
|
||||
if let Some(chr) = iter.next().cloned() {
|
||||
if chr != b'"' {
|
||||
result.push(Atom::Byte(chr));
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
return Err(PatError::UnclosedQuote);
|
||||
}
|
||||
}
|
||||
},
|
||||
// Save the cursor
|
||||
b'\'' => {
|
||||
// 'Limited' save space
|
||||
if save >= u8::MAX {
|
||||
return Err(PatError::SaveOverflow);
|
||||
}
|
||||
result.push(Atom::Save(save));
|
||||
save += 1;
|
||||
},
|
||||
// Skip bytes
|
||||
b'?' => {
|
||||
// match result.last_mut() {
|
||||
// Some(Atom::Skip(skip)) if *skip != PTR_SKIP && *skip < 127i8 => *skip += 1,
|
||||
// _ => result.push(Atom::Skip(1)),
|
||||
// };
|
||||
// Coalescence skips together
|
||||
if let Some(Atom::Skip(skip)) = result.last_mut() {
|
||||
if *skip != PTR_SKIP && *skip < 255u8 {
|
||||
*skip += 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
result.push(Atom::Skip(1));
|
||||
},
|
||||
|
||||
b'=' => {
|
||||
let op = iter.next().cloned().ok_or(PatError::CheckOperand)?;
|
||||
result.push( match op {
|
||||
b'0'..=b'9' => Atom::Check(op - b'0'),
|
||||
b'A'..=b'Z' => Atom::Check(10 + (op - b'A')),
|
||||
b'a'..=b'z' => Atom::Check(10 + (op - b'a')),
|
||||
_ => return Err(PatError::CheckOperand)
|
||||
});
|
||||
},
|
||||
b'@' => {
|
||||
let op = iter.next().cloned().ok_or(PatError::AlignedOperand)?;
|
||||
result.push( match op {
|
||||
b'0'..=b'9' => Atom::Aligned(op - b'0'),
|
||||
b'A'..=b'Z' => Atom::Aligned(10 + (op - b'A')),
|
||||
b'a'..=b'z' => Atom::Aligned(10 + (op - b'a')),
|
||||
_ => return Err(PatError::AlignedOperand)
|
||||
});
|
||||
},
|
||||
b'i' => {
|
||||
let atom = match iter.next().cloned() {
|
||||
Some(b'1') => Atom::ReadI8(save),
|
||||
Some(b'2') => Atom::ReadI16(save),
|
||||
Some(b'4') => Atom::ReadI32(save),
|
||||
_ => return Err(PatError::ReadOperand),
|
||||
};
|
||||
if save >= u8::MAX {
|
||||
return Err(PatError::SaveOverflow);
|
||||
}
|
||||
save += 1;
|
||||
result.push(atom);
|
||||
},
|
||||
b'u' => {
|
||||
let atom = match iter.next().cloned() {
|
||||
Some(b'1') => Atom::ReadU8(save),
|
||||
Some(b'2') => Atom::ReadU16(save),
|
||||
Some(b'4') => Atom::ReadU32(save),
|
||||
_ => return Err(PatError::ReadOperand),
|
||||
};
|
||||
if save >= u8::MAX {
|
||||
return Err(PatError::SaveOverflow);
|
||||
}
|
||||
save += 1;
|
||||
result.push(atom);
|
||||
},
|
||||
b'z' => {
|
||||
if save >= u8::MAX {
|
||||
return Err(PatError::SaveOverflow);
|
||||
}
|
||||
result.push(Atom::Zero(save));
|
||||
save += 1;
|
||||
},
|
||||
|
||||
|
||||
// Allow spaces as padding
|
||||
b' ' | b'\n' | b'\r' | b'\t' => {},
|
||||
// Everything else is illegal
|
||||
_ => {
|
||||
return Err(PatError::UnknownChar);
|
||||
},
|
||||
}
|
||||
// Converted from str originally, should be safe
|
||||
*pat = unsafe { str::from_utf8_unchecked(iter.as_slice()) };
|
||||
}
|
||||
// Check balanced stack operators
|
||||
if depth != 0 {
|
||||
return Err(PatError::StackError);
|
||||
}
|
||||
// Check if sub patterns are balanced
|
||||
if subs.len() != 0 {
|
||||
return Err(PatError::SubPattern);
|
||||
}
|
||||
|
||||
// Remove redundant atoms at the end
|
||||
fn is_redundant(atom: &Atom) -> bool {
|
||||
match atom {
|
||||
| Atom::Skip(_)
|
||||
| Atom::Rangext(_)
|
||||
| Atom::Pop
|
||||
| Atom::Many(_) => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
while result.last().map(is_redundant).unwrap_or(false) {
|
||||
result.pop();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Reference in New Issue
Block a user