wayver's git archive


an obsidian renderer
git clone https://git.wayver.dev/sable

sable-markdown/src/parser/blocks/html_block.rs@main

raw
Date Commit Message Author Files + -
2026-02-23 01:55 initial mvp wayverd 139 17808 0
...

1use nom::{
2    IResult, Parser,
3    branch::alt,
4    bytes::complete::{tag, tag_no_case},
5    character::complete::{
6        alpha1, alphanumeric1, anychar, char, line_ending, one_of, satisfy, space0, space1,
7    },
8    combinator::{eof, not, opt, peek, recognize, value, verify},
9    multi::{many_m_n, many0, many1},
10    sequence::{delimited, pair, preceded, terminated},
11};
12
13pub(super) fn html_block() -> impl FnMut(&str) -> IResult<&str, &str> {
14    move |input: &str| {
15        alt((
16            html_block1(),
17            html_block2(),
18            html_block3(),
19            html_block4(),
20            html_block5(),
21            html_block6(),
22            html_block7(),
23        ))
24        .parse(input)
25    }
26}
27
28fn html_block1() -> impl FnMut(&str) -> IResult<&str, &str> {
29    move |input: &str| {
30        let tag_variant_parser = || {
31            alt((
32                tag_no_case("script"),
33                tag_no_case("pre"),
34                tag_no_case("style"),
35            ))
36        };
37
38        let end_parser = || delimited(tag("</"), tag_variant_parser(), char('>'));
39
40        preceded(
41            many_m_n(0, 3, char(' ')),
42            recognize((
43                char('<'),
44                tag_variant_parser(),
45                alt((
46                    value((), char(' ')),
47                    value((), char('>')),
48                    value((), line_ending),
49                )),
50                many0(pair(peek(not(end_parser())), anychar)),
51                end_parser(),
52            )),
53        )
54        .parse(input)
55    }
56}
57
58fn html_block2() -> impl FnMut(&str) -> IResult<&str, &str> {
59    move |input: &str| {
60        preceded(
61            many_m_n(0, 3, char(' ')),
62            recognize((
63                tag("<!--"),
64                many0(pair(peek(not(tag("-->"))), anychar)),
65                tag("-->"),
66            )),
67        )
68        .parse(input)
69    }
70}
71
72fn html_block3() -> impl FnMut(&str) -> IResult<&str, &str> {
73    move |input: &str| {
74        preceded(
75            many_m_n(0, 3, char(' ')),
76            recognize((
77                tag("<?"),
78                many0(pair(peek(not(tag("?>"))), anychar)),
79                tag("?>"),
80            )),
81        )
82        .parse(input)
83    }
84}
85
86fn html_block4() -> impl FnMut(&str) -> IResult<&str, &str> {
87    move |input: &str| {
88        preceded(
89            many_m_n(0, 3, char(' ')),
90            recognize((
91                tag("<!"),
92                satisfy(|c| c.is_ascii_uppercase()),
93                many0(pair(peek(not(char('>'))), anychar)),
94                tag(">"),
95            )),
96        )
97        .parse(input)
98    }
99}
100
101fn html_block5() -> impl FnMut(&str) -> IResult<&str, &str> {
102    move |input: &str| {
103        preceded(
104            many_m_n(0, 3, char(' ')),
105            recognize((
106                tag("<![CDATA["),
107                many0(pair(peek(not(tag("]]>"))), anychar)),
108                tag("]]>"),
109            )),
110        )
111        .parse(input)
112    }
113}
114
115fn html_block6() -> impl FnMut(&str) -> IResult<&str, &str> {
116    move |input: &str| {
117        let tag_variant = alt((
118            alt((
119                tag_no_case("address"),
120                tag_no_case("article"),
121                tag_no_case("aside"),
122                tag_no_case("a"),
123                tag_no_case("base"),
124                tag_no_case("basefont"),
125                tag_no_case("blockquote"),
126                tag_no_case("body"),
127                tag_no_case("caption"),
128                tag_no_case("center"),
129                tag_no_case("col"),
130                tag_no_case("colgroup"),
131            )),
132            alt((
133                tag_no_case("dd"),
134                tag_no_case("details"),
135                tag_no_case("dialog"),
136                tag_no_case("dir"),
137                tag_no_case("div"),
138                tag_no_case("dl"),
139                tag_no_case("dt"),
140                tag_no_case("fieldset"),
141                tag_no_case("figcaption"),
142                tag_no_case("figure"),
143                tag_no_case("footer"),
144                tag_no_case("form"),
145                tag_no_case("frame"),
146                tag_no_case("frameset"),
147            )),
148            alt((
149                tag_no_case("h1"),
150                tag_no_case("h2"),
151                tag_no_case("h3"),
152                tag_no_case("h4"),
153                tag_no_case("h5"),
154                tag_no_case("h6"),
155                tag_no_case("head"),
156                tag_no_case("header"),
157                tag_no_case("hr"),
158                tag_no_case("html"),
159                tag_no_case("iframe"),
160                tag_no_case("legend"),
161            )),
162            alt((
163                tag_no_case("li"),
164                tag_no_case("link"),
165                tag_no_case("main"),
166                tag_no_case("menu"),
167                tag_no_case("menuitem"),
168                tag_no_case("nav"),
169                tag_no_case("noframes"),
170                tag_no_case("ol"),
171                tag_no_case("optgroup"),
172                tag_no_case("option"),
173                tag_no_case("p"),
174                tag_no_case("param"),
175            )),
176            alt((
177                tag_no_case("section"),
178                tag_no_case("source"),
179                tag_no_case("span"),
180                tag_no_case("summary"),
181                tag_no_case("table"),
182                tag_no_case("tbody"),
183                tag_no_case("td"),
184                tag_no_case("tfoot"),
185                tag_no_case("th"),
186                tag_no_case("thead"),
187                tag_no_case("title"),
188                tag_no_case("tr"),
189                tag_no_case("track"),
190                tag_no_case("ul"),
191            )),
192        ));
193        let end_parser = || {
194            alt((
195                value((), terminated(line_ending, (space0, line_ending))),
196                value((), eof),
197            ))
198        };
199
200        preceded(
201            many_m_n(0, 3, char(' ')),
202            recognize((
203                alt((value((), tag("</")), value((), char('<')))),
204                tag_variant,
205                alt((
206                    value((), char(' ')),
207                    value((), line_ending),
208                    value((), tag("/>")),
209                    value((), char('>')),
210                )),
211                many0(pair(peek(not(end_parser())), anychar)),
212                opt(line_ending),
213            )),
214        )
215        .parse(input)
216    }
217}
218
219fn html_block7() -> impl FnMut(&str) -> IResult<&str, &str> {
220    move |input: &str| {
221        let end_parser = || {
222            alt((
223                value((), (line_ending, space0, line_ending)),
224                value((), eof),
225            ))
226        };
227
228        preceded(
229            many_m_n(0, 3, char(' ')),
230            recognize((
231                alt((
232                    complete_open_html_tag(&["script", "pre", "style"]),
233                    complete_closing_html_tag,
234                )),
235                alt((value((), line_ending), value((), char(' ')))),
236                many0(pair(peek(not(end_parser())), anychar)),
237                end_parser(),
238            )),
239        )
240        .parse(input)
241    }
242}
243
244fn complete_open_html_tag(
245    restricted_tags: &'static [&'static str],
246) -> impl FnMut(&str) -> IResult<&str, &str> {
247    move |input: &str| {
248        recognize((
249            char('<'),
250            verify(html_tag_name, |s: &str| {
251                !restricted_tags
252                    .iter()
253                    .any(|tag| tag.eq_ignore_ascii_case(s))
254            }),
255            many0(html_tag_attribute),
256            space0,
257            opt(char('/')),
258            char('>'),
259        ))
260        .parse(input)
261    }
262}
263
264fn complete_closing_html_tag(input: &str) -> IResult<&str, &str> {
265    recognize((tag("</"), html_tag_name, space0, char('>'))).parse(input)
266}
267
268fn html_tag_name(input: &str) -> IResult<&str, &str> {
269    recognize((
270        alpha1,
271        many0(alt((value((), char('-')), value((), alphanumeric1)))),
272    ))
273    .parse(input)
274}
275
276fn html_tag_attribute(input: &str) -> IResult<&str, &str> {
277    recognize((
278        space1,
279        html_tag_attribute_name,
280        opt(html_tag_attribute_value_specification),
281    ))
282    .parse(input)
283}
284
285fn html_tag_attribute_name(input: &str) -> IResult<&str, &str> {
286    recognize((
287        alt((value((), alpha1), value((), one_of("_:")))),
288        many0(alt((value((), one_of("_.:-")), value((), alphanumeric1)))),
289    ))
290    .parse(input)
291}
292
293fn html_tag_attribute_value_specification(input: &str) -> IResult<&str, &str> {
294    recognize((space0, char('='), space0, html_tag_attribute_value)).parse(input)
295}
296
297fn html_tag_attribute_value(input: &str) -> IResult<&str, &str> {
298    alt((
299        html_tag_attribute_value_unquoted,
300        html_tag_attribute_value_single_quoted,
301        html_tag_attribute_value_double_quoted,
302    ))
303    .parse(input)
304}
305
306fn html_tag_attribute_value_unquoted(input: &str) -> IResult<&str, &str> {
307    recognize(many1(pair(
308        peek(not(alt((value((), space1), value((), one_of("\"'=<>`")))))),
309        anychar,
310    )))
311    .parse(input)
312}
313
314fn html_tag_attribute_value_single_quoted(input: &str) -> IResult<&str, &str> {
315    recognize(delimited(
316        char('\''),
317        pair(peek(not(char('\''))), anychar),
318        char('\''),
319    ))
320    .parse(input)
321}
322
323fn html_tag_attribute_value_double_quoted(input: &str) -> IResult<&str, &str> {
324    recognize(delimited(
325        char('"'),
326        pair(peek(not(char('"'))), anychar),
327        char('"'),
328    ))
329    .parse(input)
330}
331