1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
//! Library and application for parsing hyperlinks and link reference
//! definitions in Markdown, reStructuredText, Asciidoc and HTML format. The
//! library implements the
//! [CommonMark Specification 0.30](https://spec.commonmark.org/0.30/),
//! [reStructuredText Markup Specification](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html)
//! (revision 8571, date 2020-10-28),
//! [Asciidoctor User Manual, chapter 26](https://asciidoctor.org/docs/user-manual/#url) (date 2020-12-03),
//! the
//! [HTML 5.2: section 4.5.](https://www.w3.org/TR/html52/textlevel-semantics.html#the-a-element)
//! specification
//! and the [Wikitext v1.0.0](https://www.mediawiki.org/wiki/Specs/wikitext/1.0.0)
//! specification.
#![allow(dead_code)]
pub mod iterator;
pub mod parser;
pub mod renderer;
use nom::error::Error;
use nom::error::ErrorKind;
use nom::error::ParseError;
use nom::Err;
use nom::IResult;
/// A parser similar to `nom::bytes::complete::take_until()`, except that this
/// one does not stop at balanced opening and closing tags. It is designed to
/// work inside the `nom::sequence::delimited()` parser.
///
/// # Basic usage
/// ```
/// use nom::bytes::complete::tag;
/// use nom::sequence::delimited;
/// use parse_hyperlinks::take_until_unbalanced;
///
/// let mut parser = delimited(tag("<"), take_until_unbalanced('<', '>'), tag(">"));
/// assert_eq!(parser("<<inside>inside>abc"), Ok(("abc", "<inside>inside")));
/// ```
/// It skips nested brackets until it finds an extra unbalanced closing bracket. Escaped brackets
/// like `\<` and `\>` are not considered as brackets and are not counted. This function is
/// very similar to `nom::bytes::complete::take_until(">")`, except it also takes nested brackets.
///
/// # Return value
/// * `Ok((rest, taken))` where `rest` starts at the first unbalanced
///   `closing_bracket` (which is not consumed) and `taken` is everything
///   before it.
/// * `Ok((rest, input))` with an empty `rest` when the input ends while all
///   brackets are balanced.
/// * `Err(Err::Error(..))` with `ErrorKind::TakeUntil` when the input ends
///   while at least one `opening_bracket` is still unclosed.
///
/// # Trailing escape character
/// A lone `\` at the very end of the input has no character to escape; it is
/// taken literally.
/// ```
/// use parse_hyperlinks::take_until_unbalanced;
///
/// assert_eq!(take_until_unbalanced('(', ')')(r"abc\"), Ok(("", r"abc\")));
/// ```
pub fn take_until_unbalanced(
    opening_bracket: char,
    closing_bracket: char,
) -> impl Fn(&str) -> IResult<&str, &str> {
    // The characters that need special treatment never change between
    // calls, so build the search set once instead of on every iteration.
    let special_chars = [opening_bracket, closing_bracket, '\\'];
    move |i: &str| {
        // Byte offset into `i`; always kept on a char boundary.
        let mut index = 0;
        // Number of opening brackets seen that are not yet closed.
        let mut open_brackets = 0usize;
        while let Some(n) = i[index..].find(&special_chars[..]) {
            index += n;
            let mut it = i[index..].chars();
            match it.next() {
                Some('\\') => {
                    // Skip the escape char `\`.
                    index += '\\'.len_utf8();
                    // Skip also the following char, but only if there is
                    // one: a trailing `\` must not push `index` past the end
                    // of the input, otherwise the next slice would panic.
                    if let Some(escaped) = it.next() {
                        index += escaped.len_utf8();
                    }
                }
                Some(c) if c == opening_bracket => {
                    open_brackets += 1;
                    index += opening_bracket.len_utf8();
                }
                Some(c) if c == closing_bracket => {
                    if open_brackets == 0 {
                        // We found the unmatched closing bracket.
                        // We do not consume it.
                        return Ok((&i[index..], &i[..index]));
                    }
                    open_brackets -= 1;
                    index += closing_bracket.len_utf8();
                }
                // Can not happen: `find()` only stops at one of the three
                // characters handled above.
                _ => unreachable!(),
            }
        }
        if open_brackets == 0 {
            // Return the empty tail of `i` rather than an unrelated `""`
            // literal, so that combinators computing pointer offsets between
            // input and remainder keep working.
            Ok((&i[i.len()..], i))
        } else {
            Err(Err::Error(Error::from_error_kind(i, ErrorKind::TakeUntil)))
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use nom::error::ErrorKind;

    /// Exercises `take_until_unbalanced()` with balanced, unbalanced, escaped
    /// and multi-byte bracket characters.
    #[test]
    fn test_take_until_unmatched() {
        let parens = take_until_unbalanced('(', ')');

        // `(input, expected remainder, expected taken)`
        let succeeding: [(&str, &str, &str); 7] = [
            // No brackets at all: everything is taken.
            ("abc", "", "abc"),
            // Stops in front of the first unbalanced closing bracket.
            ("url)abc", ")abc", "url"),
            ("u()rl)abc", ")abc", "u()rl"),
            ("u(())rl)abc", ")abc", "u(())rl"),
            ("u(())r()l)abc", ")abc", "u(())r()l"),
            // Only balanced brackets: everything is taken.
            ("u(())r()labc", "", "u(())r()labc"),
            // Escaped brackets are not counted.
            (r#"u\((\))r()labc"#, "", r#"u\((\))r()labc"#),
        ];
        for &(input, rest, taken) in succeeding.iter() {
            assert_eq!(parens(input), Ok((rest, taken)));
        }

        // An opening bracket that is never closed is an error.
        let unclosed = "u(())r(labc";
        assert_eq!(
            parens(unclosed),
            Err(nom::Err::Error(nom::error::Error::new(
                unclosed,
                ErrorKind::TakeUntil
            )))
        );

        // Multi-byte bracket characters.
        let multibyte = take_until_unbalanced('€', 'ü');
        assert_eq!(multibyte("€uü€€üürlüabc"), Ok(("üabc", "€uü€€üürl")));
    }
}