Fun API Documentation 0.42.1
The programming language that makes you have fun!
Loading...
Searching...
No Matches
pcre2.c
Go to the documentation of this file.
1/*
2 * This file is part of the Fun programming language.
3 * https://fun-lang.xyz/
4 *
5 * Copyright 2025 Johannes Findeisen <you@hanez.org>
6 * Licensed under the terms of the Apache-2.0 license.
7 * https://opensource.org/license/apache-2-0
8 */
9
10/**
11 * @file pcre2.c
12 * @brief PCRE2 helpers for Fun VM extension opcodes (conditional build).
13 *
14 * This module centralizes the concrete PCRE2 implementation so VM opcodes in
15 * src/vm/pcre2/*.c only perform stack marshalling and delegate to these
16 * helpers. This mirrors the approach used by other extensions (e.g. SQLite,
17 * XML2) where the heavy lifting lives under src/extensions/ and the opcodes
18 * just call into small C helpers.
19 *
20 * Build-time feature flag:
21 * - The code in this file is compiled only when FUN_WITH_PCRE2 is enabled.
22 * When disabled, PCRE2-dependent opcodes are built with no-op fallbacks.
23 *
24 * PCRE2 width configuration:
25 * - PCRE2 requires defining PCRE2_CODE_UNIT_WIDTH before including <pcre2.h> to
26 * select 8/16/32-bit code units. We select 8-bit here. Because the Fun VM
27 * includes many opcode .c files in the same translation unit, it is
28 * important this macro is defined exactly once before the first <pcre2.h>
29 * inclusion. This file ensures that when FUN_WITH_PCRE2 is enabled.
30 *
31 * Flags mapping used by helpers/opcodes (bitmask in the VM):
32 * - 1 -> PCRE2_CASELESS ("i")
33 * - 2 -> PCRE2_MULTILINE ("m")
34 * - 4 -> PCRE2_DOTALL ("s")
35 * - 8 -> PCRE2_UTF ("u")
36 * - 16 -> PCRE2_EXTENDED ("x")
37 *
38 * Value and memory ownership:
39 * - The Value type and helper functions (make_map_empty, make_array_*, map_set,
40 * array_push, make_int, make_string, make_nil, string_substr, etc.) are
41 * provided by the Fun VM and are declared in the including translation unit.
42 * The arrays/maps returned from this module are owned by the caller (the
43 * VM opcode), consistent with other extension helpers.
44 *
45 * Thread-safety:
46 * - These helpers are not inherently thread-safe, but they do not maintain any
47 * internal state beyond stack-local variables. Coordinate usage externally if
48 * the embedding is multi-threaded.
49 */
50
51/* Ensure PCRE2 is configured consistently across the whole translation unit.
52 * vm.c includes many opcode implementation .c files; some use PCRE2. For PCRE2
53 * headers to expose the correct typedefs (e.g., pcre2_code, PCRE2_SPTR), the
54 * PCRE2_CODE_UNIT_WIDTH macro must be defined before the first inclusion of
55 * <pcre2.h>. We do this once here when PCRE2 support is enabled. */
56#ifdef FUN_WITH_PCRE2
57#ifndef PCRE2_CODE_UNIT_WIDTH
58#define PCRE2_CODE_UNIT_WIDTH 8
59#endif
60#include <pcre2.h>
61#include <string.h>
62
63/**
64 * @brief Map Fun VM regex flags to PCRE2 compile options.
65 *
66 * The Fun VM passes a small integer bitmask controlling common regex
67 * behaviours. This function translates those bits into the corresponding
68 * PCRE2 compile options.
69 *
70 * Bit mapping:
71 * - 1 -> PCRE2_CASELESS (case-insensitive)
72 * - 2 -> PCRE2_MULTILINE (^ and $ match start/end of line)
73 * - 4 -> PCRE2_DOTALL (dot matches newlines)
74 * - 8 -> PCRE2_UTF (treat pattern/subject as UTF-8)
75 * - 16 -> PCRE2_EXTENDED (ignore unescaped whitespace and allow comments)
76 *
77 * @param flags Bitmask provided by the VM.
78 * @return uint32_t PCRE2 options suitable for pcre2_compile().
79 */
80static uint32_t fun_pcre2_opts_from_flags(int flags) {
81 uint32_t opt = 0;
82 if (flags & 1) opt |= PCRE2_CASELESS; /* I */
83 if (flags & 2) opt |= PCRE2_MULTILINE; /* M */
84 if (flags & 4) opt |= PCRE2_DOTALL; /* S */
85 if (flags & 8) opt |= PCRE2_UTF; /* U */
86 if (flags & 16) opt |= PCRE2_EXTENDED; /* X */
87 return opt;
88}
89
90/**
91 * @brief Test whether a pattern matches a subject at least once.
92 *
93 * Compiles the given pattern with options derived from the flags bitmask and
94 * runs pcre2_match() once starting at offset 0.
95 *
96 * @param pattern NUL-terminated regex pattern string.
97 * @param subject NUL-terminated subject string.
98 * @param flags VM bitmask translated by fun_pcre2_opts_from_flags().
99 * @return int 1 if pcre2_match() returns a non-negative value; 0 if there is
100 * no match or an error occurs (including compile error or OOM).
101 *
102 * @note This helper performs only a single match attempt at offset 0; it does
103 * not search for subsequent matches. Use fun_pcre2_findall() for that.
104 */
105static int fun_pcre2_test(const char *pattern, const char *subject, int flags) {
106 if (!pattern || !subject) return 0;
107 int errorcode = 0; PCRE2_SIZE erroff = 0;
108 uint32_t opt = fun_pcre2_opts_from_flags(flags);
109 pcre2_code *re = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, opt, &errorcode, &erroff, NULL);
110 if (!re) return 0;
111 pcre2_match_data *mdata = pcre2_match_data_create_from_pattern(re, NULL);
112 int rc = pcre2_match(re, (PCRE2_SPTR)subject, (PCRE2_SIZE)strlen(subject), 0, 0, mdata, NULL);
113 pcre2_match_data_free(mdata);
114 pcre2_code_free(re);
115 return rc >= 0 ? 1 : 0;
116}
117
118/* The Value API is provided by the VM; its declarations are in the including TU. */
119/**
120 * @brief Match a pattern once and return a structured result map.
121 *
122 * On success, returns a map with the following keys:
123 * - "full" -> string: the matched substring for group 0
124 * - "start" -> int: start index (0-based) of the match in the subject
125 * - "end" -> int: end index (exclusive)
126 * - "groups" -> array: strings for each captured group (1..n), empty if none
127 *
128 * On no match, pattern compile failure, or memory allocation error, returns
129 * Nil.
130 *
131 * @param pattern NUL-terminated regex pattern string.
132 * @param subject NUL-terminated subject string.
133 * @param flags VM bitmask translated by fun_pcre2_opts_from_flags().
134 * @return Value A VM map Value as described above, or Nil on failure.
135 *
136 * @see fun_pcre2_findall()
137 */
138static Value fun_pcre2_match(const char *pattern, const char *subject, int flags) {
139 if (!pattern || !subject) return make_nil();
140 int errorcode = 0; PCRE2_SIZE erroff = 0;
141 uint32_t opt = fun_pcre2_opts_from_flags(flags);
142 pcre2_code *re = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, opt, &errorcode, &erroff, NULL);
143 if (!re) return make_nil();
144 pcre2_match_data *mdata = pcre2_match_data_create_from_pattern(re, NULL);
145 int rc = pcre2_match(re, (PCRE2_SPTR)subject, (PCRE2_SIZE)strlen(subject), 0, 0, mdata, NULL);
146 if (rc <= 0) {
147 pcre2_match_data_free(mdata);
148 pcre2_code_free(re);
149 return make_nil();
150 }
151 PCRE2_SIZE *ov = pcre2_get_ovector_pointer(mdata);
152 Value res = make_map_empty();
153 int start0 = (int)ov[0];
154 int end0 = (int)ov[1];
155 char *full = string_substr(subject, start0, end0 - start0);
156 (void)map_set(&res, "full", make_string(full ? full : ""));
157 if (full) free(full);
158 (void)map_set(&res, "start", make_int(start0));
159 (void)map_set(&res, "end", make_int(end0));
160 Value groups = make_array_from_values(NULL, 0);
161 for (int i = 1; i < rc; ++i) {
162 int s = (int)ov[2 * i];
163 int e = (int)ov[2 * i + 1];
164 char *gstr = (s >= 0 && e >= s) ? string_substr(subject, s, e - s) : NULL;
165 Value gv = make_string(gstr ? gstr : "");
166 if (gstr) free(gstr);
167 (void)array_push(&groups, gv);
168 }
169 (void)map_set(&res, "groups", groups);
170 pcre2_match_data_free(mdata);
171 pcre2_code_free(re);
172 return res;
173}
174
175/**
176 * @brief Find all non-overlapping matches of a pattern in a subject.
177 *
178 * Scans the subject from left to right and appends, for each non-overlapping
179 * match, a map with the same shape as fun_pcre2_match() to the result array.
180 * If the engine reports an empty match (start == end), the scan advances by a
181 * single code unit to prevent infinite loops.
182 *
183 * On pattern compile failure or allocation error, returns an empty array.
184 *
185 * @param pattern NUL-terminated regex pattern string.
186 * @param subject NUL-terminated subject string.
187 * @param flags VM bitmask translated by fun_pcre2_opts_from_flags().
188 * @return Value An array of match maps; may be empty when no matches are found
189 * or on error.
190 *
191 * @see fun_pcre2_match()
192 */
193static Value fun_pcre2_findall(const char *pattern, const char *subject, int flags) {
194 Value out = make_array_from_values(NULL, 0);
195 if (!pattern || !subject) return out;
196 int errorcode = 0; PCRE2_SIZE erroff = 0;
197 uint32_t opt = fun_pcre2_opts_from_flags(flags);
198 pcre2_code *re = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, opt, &errorcode, &erroff, NULL);
199 if (!re) return out;
200 pcre2_match_data *mdata = pcre2_match_data_create_from_pattern(re, NULL);
201 size_t subj_len = strlen(subject);
202 size_t start_off = 0;
203 while (1) {
204 int rc = pcre2_match(re, (PCRE2_SPTR)subject, (PCRE2_SIZE)subj_len, start_off, 0, mdata, NULL);
205 if (rc <= 0) break;
206 PCRE2_SIZE *ov = pcre2_get_ovector_pointer(mdata);
207 int s0 = (int)ov[0];
208 int e0 = (int)ov[1];
209 Value res = make_map_empty();
210 char *full = string_substr(subject, s0, e0 - s0);
211 (void)map_set(&res, "full", make_string(full ? full : ""));
212 if (full) free(full);
213 (void)map_set(&res, "start", make_int(s0));
214 (void)map_set(&res, "end", make_int(e0));
215 Value groups = make_array_from_values(NULL, 0);
216 for (int i = 1; i < rc; ++i) {
217 int s = (int)ov[2 * i];
218 int e = (int)ov[2 * i + 1];
219 char *gstr = (s >= 0 && e >= s) ? string_substr(subject, s, e - s) : NULL;
220 Value gv = make_string(gstr ? gstr : "");
221 if (gstr) free(gstr);
222 (void)array_push(&groups, gv);
223 }
224 (void)map_set(&res, "groups", groups);
225 (void)array_push(&out, res);
226 /* Advance safely (guard against empty match). */
227 if (e0 == s0) {
228 if ((size_t)e0 < subj_len) start_off = e0 + 1; else break;
229 } else {
230 start_off = e0;
231 }
232 }
233 pcre2_match_data_free(mdata);
234 pcre2_code_free(re);
235 return out;
236}
237#endif /* FUN_WITH_PCRE2 */
int map_set(Value *vm, const char *key, Value v)
Insert or replace a key in the map.
Definition map.c:79
Value make_map_empty(void)
Construct a new empty map Value.
Definition map.c:35
static Value fun_pcre2_findall(const char *pattern, const char *subject, int flags)
Find all non-overlapping matches of a pattern in a subject.
Definition pcre2.c:193
static int fun_pcre2_test(const char *pattern, const char *subject, int flags)
Test whether a pattern matches a subject at least once.
Definition pcre2.c:105
static uint32_t fun_pcre2_opts_from_flags(int flags)
Map Fun VM regex flags to PCRE2 compile options.
Definition pcre2.c:80
static Value fun_pcre2_match(const char *pattern, const char *subject, int flags)
Match a pattern once and return a structured result map.
Definition pcre2.c:138
char * string_substr(const char *s, int start, int len)
Create a newly allocated substring of s.
Definition str_utils.c:35
Tagged union representing a Fun value.
Definition value.h:68
Value make_nil(void)
Construct a nil Value.
Definition value.c:126
Value make_string(const char *s)
Construct a string Value by duplicating the given C string.
Definition value.c:95
int array_push(Value *v, Value newElem)
Append a Value to an array.
Definition value.c:257
Value make_int(int64_t v)
Construct a Value representing a 64-bit integer.
Definition value.c:51
Value make_array_from_values(const Value *vals, int count)
Create an array Value by copying items from an input span.
Definition value.c:142