wahgex_core/
compile.rs

1//! This module is responsible for compiling a Thompson NFA (Non-deterministic
2//! Finite Automaton) into a WebAssembly module.
3
4use input::{InputFunctions, InputLayout};
5use matching::MatchingFunctions;
6use state::{StateFunctions, StateLayout};
7
8pub use crate::error::BuildError;
9
10use self::context::CompileContext;
11
12mod context;
13mod epsilon_closure;
14pub mod input;
15mod instructions;
16mod lookaround;
17mod matching;
18mod pattern;
19mod sparse_set;
20mod state;
21mod transition;
22
23/// Compiles a given Thompson NFA into a [`CompiledRegex`] WebAssembly module,
24/// using the provided configuration.
25pub fn compile_from_nfa(
26    nfa: regex_automata::nfa::thompson::NFA,
27    config: super::Config,
28) -> Result<CompiledRegex, BuildError> {
29    let mut ctx = CompileContext::new(nfa, config);
30    let state_layout = StateLayout::new(&mut ctx)?;
31    let state_funcs = StateFunctions::new(&mut ctx, &state_layout)?;
32    let input_layout = InputLayout::new(&mut ctx)?;
33    let input_funcs =
34        InputFunctions::new(&mut ctx, &input_layout, state_funcs.pattern.lookup_start);
35    let _matching_funcs = MatchingFunctions::new(
36        &mut ctx,
37        &state_layout,
38        &state_funcs,
39        &input_layout,
40        &input_funcs,
41    );
42    let module: wasm_encoder::Module = ctx.compile(&state_layout.overall);
43
44    Ok(CompiledRegex {
45        wasm_bytes: module.finish(),
46    })
47}
48
/// A regular expression that has been compiled into the bytes of a
/// WebAssembly module.
///
/// The bytes are available read-only through the [`AsRef<[u8]>`](AsRef)
/// implementation. Values can be cloned and compared for equality; two
/// values are equal exactly when their byte sequences are equal.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CompiledRegex {
    /// The encoded module bytes.
    wasm_bytes: Vec<u8>,
}

impl AsRef<[u8]> for CompiledRegex {
    /// Borrows the compiled module bytes without copying them.
    fn as_ref(&self) -> &[u8] {
        &self.wasm_bytes
    }
}
61
#[cfg(test)]
mod tests {
    use super::*;

    /// Loads `module_bytes` into a default `wasmi` engine, instantiates the
    /// module through an empty linker and runs its start step.
    ///
    /// Returns the engine, the parsed module, the store and the started
    /// instance. Panics if parsing, instantiation or start fails.
    pub fn setup_interpreter(
        module_bytes: impl AsRef<[u8]>,
    ) -> (
        wasmi::Engine,
        wasmi::Module,
        wasmi::Store<()>,
        wasmi::Instance,
    ) {
        let engine = wasmi::Engine::default();
        let module = wasmi::Module::new(&engine, module_bytes).unwrap();
        let mut store = wasmi::Store::new(&engine, ());

        // The linker has no definitions registered on it.
        let linker = wasmi::Linker::<()>::new(&engine);
        let not_started = linker.instantiate(&mut store, &module).unwrap();
        let instance = not_started.start(&mut store).unwrap();

        (engine, module, store, instance)
    }

    /// Renders `module_bytes` as WebAssembly text after checking that the
    /// module validates.
    ///
    /// # Panics
    ///
    /// Panics with the validation error when the module is invalid. The
    /// message then also carries a text rendering with offsets, or the
    /// printer error if that rendering fails. Also panics if a valid module
    /// cannot be printed.
    #[track_caller]
    pub fn wasm_print_module(module_bytes: impl AsRef<[u8]>) -> String {
        let bytes = module_bytes.as_ref();
        // Printing is attempted before validation; its result is only
        // inspected once validation has succeeded.
        let rendered = wasmprinter::print_bytes(bytes);

        match wasmparser::validate(bytes) {
            Ok(_) => rendered.expect("should be able to print WASM module in WAT format"),
            Err(err) => {
                // Print again with offsets enabled for the failure message.
                let mut wasm_text_with_offsets = String::new();
                let outcome = wasmprinter::Config::new().print_offsets(true).print(
                    bytes,
                    &mut wasmprinter::PrintFmtWrite(&mut wasm_text_with_offsets),
                );

                if let Err(print_err) = outcome {
                    panic!("{err}:\nUnable to print WAT: {print_err}");
                }
                panic!("{err}:\n{wasm_text_with_offsets}")
            },
        }
    }

    /// Test helper: builds a Thompson NFA from `pattern` and compiles it with
    /// the default configuration into a [`CompiledRegex`].
    fn compile(pattern: &str) -> Result<CompiledRegex, Box<dyn std::error::Error>> {
        let nfa = regex_automata::nfa::thompson::NFA::new(pattern)?;
        let compiled = compile_from_nfa(nfa, crate::Config::new())?;

        Ok(compiled)
    }

    /// Test helper: compiles `pattern` and returns the validated text
    /// rendering of the resulting module, panicking on any failure.
    #[track_caller]
    fn render_pattern(pattern: &str) -> String {
        let compiled = compile(pattern).unwrap();
        wasm_print_module(&compiled)
    }

    // Each test below snapshots the rendered module for a single pattern.
    // The snapshot assertion stays inside the test function itself.

    #[test]
    fn empty_regex() {
        let pretty = render_pattern("");
        insta::assert_snapshot!(pretty);
    }

    #[test]
    fn simple_repetition() {
        let pretty = render_pattern("(?:abc)+");
        insta::assert_snapshot!(pretty);
    }

    #[test]
    fn sparse_transitions() {
        let pretty = render_pattern("a|b|d|e|g");
        insta::assert_snapshot!(pretty);
    }

    #[test]
    fn simple_lookaround() {
        let pretty = render_pattern("^hell worm$");
        insta::assert_snapshot!(pretty);
    }

    #[test]
    fn repeated_lookaround() {
        let pretty = render_pattern("(?:^|$)+");
        insta::assert_snapshot!(pretty);
    }

    #[test]
    fn lookaround_crlf() {
        let pretty = render_pattern("(?mR)^[a-z]+$");
        insta::assert_snapshot!(pretty);
    }

    #[test]
    fn lookaround_lf() {
        let pretty = render_pattern("(?m)^$");
        insta::assert_snapshot!(pretty);
    }

    #[test]
    fn lookaround_is_ascii_word() {
        let pretty = render_pattern(r"(?-u)hello\B");
        insta::assert_snapshot!(pretty);
    }

    #[test]
    fn lookaround_is_ascii_start_end() {
        let pretty = render_pattern(r"(?-u:\b{start}hello\b{end})");
        insta::assert_snapshot!(pretty);
    }

    #[test]
    fn lookaround_is_ascii_half_start_end() {
        let pretty = render_pattern(r"(?-u:\b{start-half}hello\b{end-half})");
        insta::assert_snapshot!(pretty);
    }
}