1use std::alloc::{Layout, LayoutError};
5
6use regex_automata::{
7 nfa::thompson::NFA,
8 util::look::{Look, LookMatcher, LookSet},
9};
10use wasm_encoder::{BlockType, InstructionSink, MemArg, NameMap, ValType};
11
12use crate::util::repeat;
13
14use super::{
15 context::{
16 ActiveDataSegment, CompileContext, FunctionDefinition, FunctionIdx, FunctionSignature,
17 },
18 BuildError,
19};
20
21#[derive(Debug)]
23pub struct LookLayout {
24 is_word_byte_table: Option<IsWordByteTable>,
25}
26
27#[derive(Debug)]
28struct IsWordByteTable {
29 position: u64,
30}
31
32impl LookLayout {
33 const UTF8_IS_WORD_BYTE_LUT: [bool; 256] = {
40 let mut set = [false; 256];
41 set[b'_' as usize] = true;
42
43 let mut byte = b'0';
44 while byte <= b'9' {
45 set[byte as usize] = true;
46 byte += 1;
47 }
48 byte = b'A';
49 while byte <= b'Z' {
50 set[byte as usize] = true;
51 byte += 1;
52 }
53 byte = b'a';
54 while byte <= b'z' {
55 set[byte as usize] = true;
56 byte += 1;
57 }
58 set
59 };
60
61 pub fn new(
63 ctx: &mut CompileContext,
64 mut overall: Layout,
65 ) -> Result<(Layout, Self), LayoutError> {
66 let look_set = modified_lookset_for_dependencies(&ctx.nfa);
67 let is_word_byte_table = if needs_is_word_byte_lut(look_set) {
68 let (table_layout, _table_stride) = repeat(&Layout::new::<u8>(), 256)?;
69
70 let (new_overall, table_pos) = overall.extend(table_layout)?;
71 overall = new_overall;
72
73 ctx.sections.add_active_data_segment(ActiveDataSegment {
74 name: "utf8_is_word_byte_table".into(),
75 data: Self::UTF8_IS_WORD_BYTE_LUT
76 .into_iter()
77 .map(|b| b as u8)
78 .collect(),
79 position: table_pos,
80 });
81
82 Some(IsWordByteTable {
83 position: table_pos.try_into().expect("position should fit in u64"),
84 })
85 } else {
86 None
87 };
88
89 Ok((overall, Self { is_word_byte_table }))
90 }
91}
92
93#[derive(Debug)]
95pub struct LookFunctions {
96 look_matches: [Option<FunctionIdx>; Self::NUM_LOOKS],
97}
98
99impl LookFunctions {
100 const NUM_LOOKS: usize = (Look::WordEndHalfUnicode.as_repr().ilog2() as usize) + 1;
104
105 pub fn new(ctx: &mut CompileContext, layout: &LookLayout) -> Result<Self, BuildError> {
107 let mut look_matches = [None; Self::NUM_LOOKS];
108 let look_set = modified_lookset_for_dependencies(&ctx.nfa);
109
110 if look_set.is_empty() {
111 return Ok(Self { look_matches });
112 }
113
114 for look in look_set.iter() {
115 if is_unsupported_lookaround(look) {
116 return Err(BuildError::unsupported(format!(
118 "{look:?}/{} is not yet implemented",
119 look.as_char()
120 )));
121 }
122
123 let sig = lookaround_fn_signature(lookaround_fn_name(look));
124
125 let func = ctx.declare_function(sig);
126 look_matches[look.as_repr().ilog2() as usize] = Some(func);
127 }
128
129 let look_matcher = ctx.nfa.look_matcher().clone();
130 for (look, func_idx) in look_set
131 .iter()
132 .map(|look| (look, look_matches[look.as_repr().ilog2() as usize]))
133 {
134 let func_def = match look {
135 Look::Start => Self::is_start_fn(),
136 Look::End => Self::is_end_fn(),
137 Look::StartLF => Self::is_start_lf_fn(&look_matcher),
138 Look::EndLF => Self::is_end_lf_fn(&look_matcher),
139 Look::StartCRLF => Self::is_start_crlf_fn(),
140 Look::EndCRLF => Self::is_end_crlf_fn(),
141 Look::WordAscii => Self::is_word_ascii_fn(
142 layout
143 .is_word_byte_table
144 .as_ref()
145 .expect("should have generated table"),
146 ),
147 Look::WordAsciiNegate => Self::is_word_ascii_negate_fn(
148 look_matches[Look::WordAscii.as_repr().ilog2() as usize]
149 .expect("should have generated `look_is_word_ascii` function"),
151 ),
152 Look::WordStartAscii => Self::is_word_start_ascii_fn(
153 layout
154 .is_word_byte_table
155 .as_ref()
156 .expect("should have generated table"),
157 ),
158 Look::WordEndAscii => Self::is_word_end_ascii_fn(
159 layout
160 .is_word_byte_table
161 .as_ref()
162 .expect("should have generated table"),
163 ),
164 Look::WordStartHalfAscii => Self::is_word_start_half_ascii_fn(
165 layout
166 .is_word_byte_table
167 .as_ref()
168 .expect("should have generated table"),
169 ),
170 Look::WordEndHalfAscii => Self::is_word_end_half_ascii_fn(
171 layout
172 .is_word_byte_table
173 .as_ref()
174 .expect("should have generated table"),
175 ),
176 _ => unreachable!("Should be unreachable due to first loop through lookset"),
177 };
178
179 ctx.define_function(
180 func_idx.expect("should have generated function for supported lookaround"),
181 func_def,
182 );
183 }
184
185 Ok(Self { look_matches })
186 }
187
188 pub fn look_matcher(&self, look: Look) -> Option<FunctionIdx> {
190 self.look_matches[look.as_repr().ilog2() as usize]
191 }
192
193 fn is_start_fn() -> FunctionDefinition {
194 let locals_name_map = lookaround_fn_common_name_map();
195
196 let mut body = wasm_encoder::Function::new([]);
202 body.instructions()
205 .local_get(2)
207 .i64_eqz()
208 .end();
209
210 FunctionDefinition {
211 body,
212 locals_name_map,
213 labels_name_map: None,
214 branch_hints: None,
215 }
216 }
217
218 fn is_end_fn() -> FunctionDefinition {
219 let locals_name_map = lookaround_fn_common_name_map();
220
221 let mut body = wasm_encoder::Function::new([]);
227 body.instructions()
228 .local_get(1)
230 .local_get(2)
231 .i64_eq()
232 .end();
233
234 FunctionDefinition {
235 body,
236 locals_name_map,
237 labels_name_map: None,
238 branch_hints: None,
239 }
240 }
241
242 fn is_start_lf_fn(look_matcher: &LookMatcher) -> FunctionDefinition {
243 let locals_name_map = lookaround_fn_common_name_map();
244
245 let mut body = wasm_encoder::Function::new([]);
251 body.instructions()
252 .local_get(2)
254 .i64_eqz()
255 .if_(BlockType::Empty)
256 .i32_const(true as i32)
258 .return_()
259 .end()
260 .local_get(2)
262 .i64_const(1)
263 .i64_sub()
264 .local_get(0)
265 .i64_add()
266 .i32_load8_u(MemArg {
267 offset: 0,
268 align: 0, memory_index: 0, })
271 .i32_const(look_matcher.get_line_terminator() as i32)
272 .i32_eq()
273 .end();
274
275 FunctionDefinition {
276 body,
277 locals_name_map,
278 labels_name_map: None,
279 branch_hints: None,
280 }
281 }
282
283 fn is_end_lf_fn(look_matcher: &LookMatcher) -> FunctionDefinition {
284 let locals_name_map = lookaround_fn_common_name_map();
285
286 let mut body = wasm_encoder::Function::new([]);
292 body.instructions()
293 .local_get(2)
295 .local_get(1)
296 .i64_eq()
297 .if_(BlockType::Empty)
298 .i32_const(true as i32)
300 .return_()
301 .end()
302 .local_get(2)
304 .local_get(0)
305 .i64_add()
306 .i32_load8_u(MemArg {
307 offset: 0,
308 align: 0, memory_index: 0, })
311 .i32_const(look_matcher.get_line_terminator() as i32)
312 .i32_eq()
313 .end();
314
315 FunctionDefinition {
316 body,
317 locals_name_map,
318 labels_name_map: None,
319 branch_hints: None,
320 }
321 }
322
323 fn is_start_crlf_fn() -> FunctionDefinition {
324 let locals_name_map = lookaround_fn_common_name_map();
325
326 let mut body = wasm_encoder::Function::new([]);
344 body.instructions()
345 .local_get(2)
347 .i64_eqz()
348 .if_(BlockType::Empty)
349 .i32_const(true as i32)
350 .return_()
351 .end()
352 .local_get(2)
354 .i64_const(1)
355 .i64_sub()
356 .local_get(0)
357 .i64_add()
358 .i32_load8_u(MemArg {
359 offset: 0,
360 align: 0, memory_index: 0, })
363 .i32_const(b'\n' as i32)
364 .i32_eq()
365 .if_(BlockType::Empty)
366 .i32_const(true as i32)
367 .return_()
368 .end()
369 .local_get(2)
371 .i64_const(1)
372 .i64_sub()
373 .local_get(0)
374 .i64_add()
375 .i32_load8_u(MemArg {
376 offset: 0,
377 align: 0, memory_index: 0, })
380 .i32_const(b'\r' as i32)
381 .i32_ne()
382 .if_(BlockType::Empty)
383 .i32_const(false as i32)
384 .return_()
385 .end()
386 .local_get(2)
388 .local_get(1)
389 .i64_ge_u()
390 .if_(BlockType::Empty)
391 .i32_const(true as i32)
392 .return_()
393 .end()
394 .local_get(2)
396 .local_get(0)
397 .i64_add()
398 .i32_load8_u(MemArg {
399 offset: 0,
400 align: 0, memory_index: 0, })
403 .i32_const(b'\n' as i32)
404 .i32_ne()
405 .end();
406
407 FunctionDefinition {
408 body,
409 locals_name_map,
410 labels_name_map: None,
411 branch_hints: None,
412 }
413 }
414
415 fn is_end_crlf_fn() -> FunctionDefinition {
416 let locals_name_map = lookaround_fn_common_name_map();
417
418 let mut body = wasm_encoder::Function::new([]);
440
441 body.instructions()
442 .local_get(2)
444 .local_get(1)
445 .i64_eq()
446 .if_(BlockType::Empty)
447 .i32_const(true as i32)
448 .return_()
449 .end()
450 .local_get(2)
452 .local_get(0)
453 .i64_add()
454 .i32_load8_u(MemArg {
455 offset: 0,
456 align: 0, memory_index: 0, })
459 .i32_const(b'\r' as i32)
460 .i32_eq()
461 .if_(BlockType::Empty)
462 .i32_const(true as i32)
463 .return_()
464 .end()
465 .local_get(2)
467 .local_get(0)
468 .i64_add()
469 .i32_load8_u(MemArg {
470 offset: 0,
471 align: 0, memory_index: 0, })
474 .i32_const(b'\n' as i32)
475 .i32_ne()
476 .if_(BlockType::Empty)
477 .i32_const(false as i32)
478 .return_()
479 .end()
480 .local_get(2)
482 .i64_eqz()
483 .if_(BlockType::Empty)
484 .i32_const(true as i32)
485 .return_()
486 .end()
487 .local_get(2)
489 .i64_const(1)
490 .i64_sub()
491 .local_get(0)
492 .i64_add()
493 .i32_load8_u(MemArg {
494 offset: 0,
495 align: 0, memory_index: 0, })
498 .i32_const(b'\r' as i32)
499 .i32_ne()
500 .end();
501
502 FunctionDefinition {
503 body,
504 locals_name_map,
505 labels_name_map: None,
506 branch_hints: None,
507 }
508 }
509
510 fn is_word_ascii_fn(is_word_byte_table: &IsWordByteTable) -> FunctionDefinition {
511 let mut locals_name_map = lookaround_fn_common_name_map();
512 locals_name_map.append(3, "word_before");
514
515 let mut body = wasm_encoder::Function::new([(2, ValType::I32)]);
522
523 let mut instructions = body.instructions();
524 Self::word_before_ascii_instructions(&mut instructions, is_word_byte_table);
525 instructions.local_set(3);
526
527 Self::word_after_ascii_instructions(&mut instructions, is_word_byte_table);
528 instructions
529 .local_get(3)
531 .i32_ne()
532 .end();
533
534 FunctionDefinition {
535 body,
536 locals_name_map,
537 labels_name_map: None,
538 branch_hints: None,
539 }
540 }
541
542 fn is_word_ascii_negate_fn(is_word_ascii: FunctionIdx) -> FunctionDefinition {
543 let locals_name_map = lookaround_fn_common_name_map();
544
545 let mut body = wasm_encoder::Function::new([]);
551 body.instructions()
552 .local_get(0)
553 .local_get(1)
554 .local_get(2)
555 .call(is_word_ascii.into())
556 .i32_const(1)
557 .i32_xor()
558 .end();
559
560 FunctionDefinition {
561 body,
562 locals_name_map,
563 labels_name_map: None,
564 branch_hints: None,
565 }
566 }
567
568 fn is_word_start_ascii_fn(is_word_byte_table: &IsWordByteTable) -> FunctionDefinition {
569 let mut locals_name_map = lookaround_fn_common_name_map();
570 locals_name_map.append(3, "word_before");
572
573 let mut body = wasm_encoder::Function::new([(2, ValType::I32)]);
580
581 let mut instructions = body.instructions();
582 Self::word_before_ascii_instructions(&mut instructions, is_word_byte_table);
583 instructions.local_set(3);
584
585 Self::word_after_ascii_instructions(&mut instructions, is_word_byte_table);
586 instructions
587 .local_get(3)
589 .i32_const(1)
590 .i32_xor()
591 .i32_and()
592 .end();
593
594 FunctionDefinition {
595 body,
596 locals_name_map,
597 labels_name_map: None,
598 branch_hints: None,
599 }
600 }
601
602 fn is_word_end_ascii_fn(is_word_byte_table: &IsWordByteTable) -> FunctionDefinition {
603 let mut locals_name_map = lookaround_fn_common_name_map();
604 locals_name_map.append(3, "word_before");
606
607 let mut body = wasm_encoder::Function::new([(2, ValType::I32)]);
614
615 let mut instructions = body.instructions();
616 Self::word_before_ascii_instructions(&mut instructions, is_word_byte_table);
617 instructions.local_set(3);
618
619 Self::word_after_ascii_instructions(&mut instructions, is_word_byte_table);
620 instructions
621 .i32_const(1)
623 .i32_xor()
624 .local_get(3)
625 .i32_and()
626 .end();
627
628 FunctionDefinition {
629 body,
630 locals_name_map,
631 labels_name_map: None,
632 branch_hints: None,
633 }
634 }
635
636 fn is_word_start_half_ascii_fn(is_word_byte_table: &IsWordByteTable) -> FunctionDefinition {
637 let locals_name_map = lookaround_fn_common_name_map();
638
639 let mut body = wasm_encoder::Function::new([]);
646
647 let mut instructions = body.instructions();
648 Self::word_before_ascii_instructions(&mut instructions, is_word_byte_table);
649 instructions
650 .i32_const(1)
652 .i32_xor()
653 .end();
654
655 FunctionDefinition {
656 body,
657 locals_name_map,
658 labels_name_map: None,
659 branch_hints: None,
660 }
661 }
662
663 fn is_word_end_half_ascii_fn(is_word_byte_table: &IsWordByteTable) -> FunctionDefinition {
664 let locals_name_map = lookaround_fn_common_name_map();
665
666 let mut body = wasm_encoder::Function::new([]);
673 let mut instructions = body.instructions();
674
675 Self::word_after_ascii_instructions(&mut instructions, is_word_byte_table);
676 instructions
677 .i32_const(1)
679 .i32_xor()
680 .end();
681
682 FunctionDefinition {
683 body,
684 locals_name_map,
685 labels_name_map: None,
686 branch_hints: None,
687 }
688 }
689
690 fn word_before_ascii_instructions(
691 instructions: &mut InstructionSink,
692 is_word_byte_table: &IsWordByteTable,
693 ) {
694 instructions
704 .local_get(2)
706 .i64_eqz()
707 .if_(BlockType::Result(ValType::I32))
708 .i32_const(false as i32)
709 .else_()
710 .local_get(2)
712 .i64_const(1)
713 .i64_sub()
714 .local_get(0)
715 .i64_add()
716 .i32_load8_u(MemArg {
717 offset: 0,
718 align: 0, memory_index: 0, })
721 .i64_extend_i32_u()
722 .i32_load8_u(MemArg {
723 offset: is_word_byte_table.position,
724 align: 0, memory_index: 1,
726 })
727 .end();
728 }
729
730 fn word_after_ascii_instructions(
731 instructions: &mut InstructionSink,
732 is_word_byte_table: &IsWordByteTable,
733 ) {
734 instructions
744 .local_get(2)
747 .local_get(1)
748 .i64_ge_u()
749 .if_(BlockType::Result(ValType::I32))
750 .i32_const(false as i32)
751 .else_()
752 .local_get(2)
754 .local_get(0)
755 .i64_add()
756 .i32_load8_u(MemArg {
757 offset: 0,
758 align: 0, memory_index: 0, })
761 .i64_extend_i32_u()
762 .i32_load8_u(MemArg {
763 offset: is_word_byte_table.position,
764 align: 0, memory_index: 1,
766 })
767 .end();
768 }
769}
770
771fn lookaround_fn_signature(name: &str) -> FunctionSignature {
772 FunctionSignature {
773 name: name.into(),
774 params_ty: &[ValType::I64, ValType::I64, ValType::I64],
776 results_ty: &[ValType::I32],
778 export: false,
779 }
780}
781
782fn lookaround_fn_common_name_map() -> NameMap {
783 let mut locals_name_map = NameMap::new();
784 locals_name_map.append(0, "haystack_ptr");
785 locals_name_map.append(1, "haystack_len");
786 locals_name_map.append(2, "at_offset");
787
788 locals_name_map
789}
790
791fn lookaround_fn_name(look: Look) -> &'static str {
792 match look {
793 Look::Start => "look_is_start",
794 Look::End => "look_is_end",
795 Look::StartLF => "look_is_start_lf",
796 Look::EndLF => "look_is_end_lf",
797 Look::StartCRLF => "look_is_start_crlf",
798 Look::EndCRLF => "look_is_end_crlf",
799 Look::WordAscii => "look_is_word_ascii",
800 Look::WordAsciiNegate => "look_is_word_ascii_negate",
801 Look::WordUnicode => "look_is_word_unicode",
802 Look::WordUnicodeNegate => "look_is_word_unicode_negate",
803 Look::WordStartAscii => "look_is_word_start_ascii",
804 Look::WordEndAscii => "look_is_word_end_ascii",
805 Look::WordStartUnicode => "look_is_word_start_unicode",
806 Look::WordEndUnicode => "look_is_word_end_unicode",
807 Look::WordStartHalfAscii => "look_is_word_start_half_ascii",
808 Look::WordEndHalfAscii => "look_is_word_end_half_ascii",
809 Look::WordStartHalfUnicode => "look_is_word_start_half_unicode",
810 Look::WordEndHalfUnicode => "look_is_word_end_half_unicode",
811 }
812}
813
814fn is_unsupported_lookaround(look: Look) -> bool {
815 matches!(
816 look,
817 Look::WordUnicode
818 | Look::WordUnicodeNegate
819 | Look::WordStartUnicode
820 | Look::WordEndUnicode
821 | Look::WordStartHalfUnicode
822 | Look::WordEndHalfUnicode
823 )
824}
825
826fn needs_is_word_byte_lut(look_set: LookSet) -> bool {
827 look_set.contains(Look::WordAscii)
828 || look_set.contains(Look::WordAsciiNegate)
829 || look_set.contains(Look::WordStartAscii)
830 || look_set.contains(Look::WordEndAscii)
831 || look_set.contains(Look::WordStartHalfAscii)
832 || look_set.contains(Look::WordEndHalfAscii)
833}
834
835fn modified_lookset_for_dependencies(nfa: &NFA) -> LookSet {
836 let mut look_set = nfa.look_set_any();
837
838 if look_set.contains(Look::WordAsciiNegate) {
841 look_set = look_set.insert(Look::WordAscii);
842 }
843
844 look_set
845}