From bbf120f753013b992ef120490d4d7993a1ac3de2 Mon Sep 17 00:00:00 2001 From: videogame hacker Date: Sat, 7 Aug 2021 10:34:53 +0100 Subject: [PATCH] Initial commit --- .gitignore | 1 + Cargo.lock | 49 ++++++++++++++++++++++ Cargo.toml | 8 ++++ README.md | 10 +++++ src/ast.rs | 22 ++++++++++ src/convert.rs | 52 +++++++++++++++++++++++ src/lib.rs | 13 ++++++ src/parse_basic.rs | 87 +++++++++++++++++++++++++++++++++++++++ src/parse_code.rs | 36 ++++++++++++++++ src/parse_inline_style.rs | 84 +++++++++++++++++++++++++++++++++++++ src/parse_link.rs | 37 +++++++++++++++++ src/parse_quotes.rs | 49 ++++++++++++++++++++++ 12 files changed, 448 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 README.md create mode 100644 src/ast.rs create mode 100644 src/convert.rs create mode 100644 src/lib.rs create mode 100644 src/parse_basic.rs create mode 100644 src/parse_code.rs create mode 100644 src/parse_inline_style.rs create mode 100644 src/parse_link.rs create mode 100644 src/parse_quotes.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..d70d575 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,49 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "aho-corasick" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +dependencies = [ + "memchr", +] + +[[package]] +name = "discord_message_format" +version = "0.1.0" +dependencies = [ + "lazy_static", + "regex", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "memchr" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" + +[[package]] +name = "regex" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..e33551e --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "discord_message_format" +version = "0.1.0" +edition = "2018" + +[dependencies] +lazy_static = "1.4.0" +regex = "1.5.4" diff --git a/README.md b/README.md new file mode 100644 index 0000000..70534e6 --- /dev/null +++ b/README.md @@ -0,0 +1,10 @@ +# discord_message_format + +AST Construction for the format that Discord use for messages. + +This lets you be formatting-aware when you parse user messages when writing bots. + +## To Do + +- User/role mentions +- Emoji diff --git a/src/ast.rs b/src/ast.rs new file mode 100644 index 0000000..b1c8885 --- /dev/null +++ b/src/ast.rs @@ -0,0 +1,22 @@ +#[derive(Debug)] +pub enum DiscordComponent<'a> { + Plain(&'a str), + Literal(char), + Link(&'a str), + + Bold(Vec>), + Italic(Vec>), + Strikethrough(Vec>), + Underline(Vec>), + + Code(&'a str), + CodeBlock { + lang: Option<&'a str>, + source: &'a str, + }, + + Spoiler(Vec>), + + LineBreak, + Quote(Vec>), +} diff --git a/src/convert.rs b/src/convert.rs new file mode 100644 index 0000000..a8530a5 --- /dev/null +++ b/src/convert.rs @@ -0,0 +1,52 @@ +use super::DiscordComponent; + +pub trait ToHtml { + fn to_html(&self) -> String; +} + +impl<'a> ToHtml for DiscordComponent<'a> { + fn to_html(&self) -> String { + match self { + DiscordComponent::Plain(s) => s.to_string(), // TODO: Escape + DiscordComponent::Literal(c) => c.to_string(), + DiscordComponent::Link(target) => format!(r#"{0}"#, target), + + DiscordComponent::Bold(children) => { + format!("{}", children.to_html()) + } + DiscordComponent::Italic(children) => { + format!("{}", children.to_html()) + } + DiscordComponent::Strikethrough(children) => { + format!("{}", children.to_html()) + } + DiscordComponent::Underline(children) => { + format!("{}", children.to_html()) + } + + DiscordComponent::Code(source) => format!("{}", source), + DiscordComponent::CodeBlock { lang, source } => { + let language_class = lang + .map(|l| " class=\"language-".to_owned() + l + "\"") + .unwrap_or_else(String::new); + + format!("
{}
", language_class, source) + } + + DiscordComponent::Spoiler(children) => { + format!("{}", children.to_html()) + } + + DiscordComponent::LineBreak => "
".to_string(), + DiscordComponent::Quote(children) => { + format!("
{}
", children.to_html()) + } + } + } +} + +impl<'a> ToHtml for Vec> { + fn to_html(&self) -> String { + self.iter().map(|c| c.to_html()).collect() + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..7594541 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,13 @@ +mod ast; + +mod convert; + +mod parse_basic; +mod parse_code; +mod parse_inline_style; +mod parse_link; +mod parse_quotes; + +pub use ast::DiscordComponent; +pub use convert::ToHtml; +pub use parse_basic::parse; diff --git a/src/parse_basic.rs b/src/parse_basic.rs new file mode 100644 index 0000000..61ca8ce --- /dev/null +++ b/src/parse_basic.rs @@ -0,0 +1,87 @@ +use super::{ + ast::DiscordComponent, parse_code::*, parse_inline_style::*, parse_link::*, parse_quotes::*, +}; + +pub fn parse(text: &'_ str) -> Vec> { + let mut tokens = Vec::new(); + let mut working_plain_start: isize = -1; + let mut i = 0; + while i < text.len() { + let is_line_start = matches!(tokens.last(), None | Some(DiscordComponent::LineBreak)); + + match parse_token(&text[i..], is_line_start) { + Some((token, consumed)) => { + if working_plain_start >= 0 { + let plain_start = working_plain_start as usize; + tokens.push(DiscordComponent::Plain(&text[plain_start..i])); + working_plain_start = -1; + } + + tokens.push(token); + i += consumed; + } + + None => { + if working_plain_start < 0 { + working_plain_start = i as isize; + } + + let mut next_char = i + 1; + while !text.is_char_boundary(next_char) { + next_char += 1; + } + i = next_char; + } + } + } + + if working_plain_start >= 0 { + let plain_start = working_plain_start as usize; + tokens.push(DiscordComponent::Plain(&text[plain_start..])); + } + + tokens +} + +pub fn parse_token(text: &'_ str, is_line_start: bool) -> Option<(DiscordComponent<'_>, usize)> { + parse_escaped_literal(text) + .or_else(|| { + if is_line_start { + parse_quotes(text) + } else { + None + } + }) + .or_else(|| parse_code_block(text)) + .or_else(|| parse_code(text)) + .or_else(|| parse_link(text)) + .or_else(|| parse_bold(text)) + .or_else(|| parse_italic(text)) + .or_else(|| parse_strikethrough(text)) + .or_else(|| parse_underline(text)) + .or_else(|| parse_spoiler(text)) + .or_else(|| parse_line_break(text)) +} + +pub fn parse_escaped_literal(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> { + let mut chars = text.chars(); + if let Some('\\') = chars.next() { + return match chars.next() { + Some(x @ '\\') | Some(x @ '`') | Some(x @ '*') | Some(x @ '_') | Some(x @ '{') + | Some(x @ '}') | Some(x @ '[') | Some(x @ ']') | Some(x @ '(') | Some(x @ ')') + | Some(x @ '#') | Some(x @ '+') | Some(x @ '-') | Some(x @ '.') | Some(x @ '!') => { + Some((DiscordComponent::Literal(x), 2)) + } + _ => None, + }; + } + None +} + +pub fn parse_line_break(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> { + if text.starts_with('\n') { + return Some((DiscordComponent::LineBreak, 1)); + } + + None +} diff --git a/src/parse_code.rs b/src/parse_code.rs new file mode 100644 index 0000000..614a5ec --- /dev/null +++ b/src/parse_code.rs @@ -0,0 +1,36 @@ +use lazy_static::lazy_static; +use regex::Regex; + +use super::ast::DiscordComponent; + +pub fn parse_code_block(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> { + lazy_static! { + static ref CODE_BLOCK: Regex = + Regex::new(r"^```(?P.+)?\n(?P.*?)\n```").unwrap(); + } + + if let Some(caps) = CODE_BLOCK.captures(text) { + let lang = caps.name("lang").map(|m| m.as_str()); + let source = caps.name("source").unwrap().as_str(); + let whole_len = caps.get(0).unwrap().as_str().len(); + + return Some((DiscordComponent::CodeBlock { lang, source }, whole_len)); + } + + None +} + +pub fn parse_code(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> { + lazy_static! { + static ref CODE: Regex = Regex::new(r"^`(?P.+?)`").unwrap(); + } + + if let Some(caps) = CODE.captures(text) { + let inner = caps.name("inner").unwrap().as_str(); + let whole_len = caps.get(0).unwrap().as_str().len(); + + return Some((DiscordComponent::Code(inner), whole_len)); + } + + None +} diff --git a/src/parse_inline_style.rs b/src/parse_inline_style.rs new file mode 100644 index 0000000..18e357a --- /dev/null +++ b/src/parse_inline_style.rs @@ -0,0 +1,84 @@ +use lazy_static::lazy_static; +use regex::Regex; + +use super::ast::DiscordComponent; +use super::parse_basic::parse; + +pub fn parse_bold(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> { + lazy_static! { + static ref BOLD: Regex = Regex::new(r"^\*\*(?P.+?)\*\*").unwrap(); + } + + if let Some(caps) = BOLD.captures(text) { + let inner = caps.name("inner").unwrap().as_str(); + let whole_len = caps.get(0).unwrap().as_str().len(); + + return Some((DiscordComponent::Bold(parse(inner)), whole_len)); + } + + None +} + +pub fn parse_italic(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> { + lazy_static! { + static ref ITALIC_UNDERSCORE: Regex = Regex::new(r"^_(?P.+?)_").unwrap(); + static ref ITALIC_ASTERISK: Regex = Regex::new(r"^\*(?P.+?)\*").unwrap(); + } + + if let Some(caps) = ITALIC_UNDERSCORE + .captures(text) + .or_else(|| ITALIC_ASTERISK.captures(text)) + { + let inner = caps.name("inner").unwrap().as_str(); + let whole_len = caps.get(0).unwrap().as_str().len(); + + return Some((DiscordComponent::Italic(parse(inner)), whole_len)); + } + + None +} + +pub fn parse_strikethrough(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> { + lazy_static! { + static ref STRIKETHROUGH: Regex = Regex::new(r"^~~(?P.+?)~~").unwrap(); + } + + if let Some(caps) = STRIKETHROUGH.captures(text) { + let inner = caps.name("inner").unwrap().as_str(); + let whole_len = caps.get(0).unwrap().as_str().len(); + + return Some((DiscordComponent::Strikethrough(parse(inner)), whole_len)); + } + + None +} + +pub fn parse_underline(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> { + lazy_static! { + static ref UNDERLINE: Regex = Regex::new(r"^__(?P.+?)__").unwrap(); + } + + if let Some(caps) = UNDERLINE.captures(text) { + let inner = caps.name("inner").unwrap().as_str(); + let whole_len = caps.get(0).unwrap().as_str().len(); + + return Some((DiscordComponent::Underline(parse(inner)), whole_len)); + } + + None +} + +pub fn parse_spoiler(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> { + lazy_static! { + static ref SPOILER: Regex = Regex::new(r"^\|\|(?P.+?)\|\|").unwrap(); + } + + if let Some(caps) = SPOILER.captures(text) { + let inner = caps.name("inner").unwrap().as_str(); + let whole_len = caps.get(0).unwrap().as_str().len(); + + return Some((DiscordComponent::Spoiler(parse(inner)), whole_len)); + } + + None +} diff --git a/src/parse_link.rs b/src/parse_link.rs new file mode 100644 index 0000000..8e22d5e --- /dev/null +++ b/src/parse_link.rs @@ -0,0 +1,37 @@ +use lazy_static::lazy_static; +use regex::Regex; + +use super::ast::DiscordComponent; + +pub fn parse_link(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> { + lazy_static! { + static ref LINK: Regex = + Regex::new(r#"^((?:https?|steam)://[^\s<]+[^<.,:;"'\]\s])"#).unwrap(); + } + + if let Some(caps) = LINK.captures(text) { + let mut link_range = caps.get(0).unwrap().range(); + let orig_link = &text[link_range.clone()]; + + if orig_link.ends_with(')') { + let mut bracket_balance: i16 = 0; + for c in orig_link.chars() { + if c == '(' { + bracket_balance -= 1; + } else if c == ')' { + bracket_balance += 1; + } + } + + if bracket_balance > 0 { + link_range = link_range.start..(link_range.end - 1); + } + } + + let link = &text[link_range]; + return Some((DiscordComponent::Link(link), link.len())); + } + + // TODO(Charlotte): Parse links + None +} diff --git a/src/parse_quotes.rs b/src/parse_quotes.rs new file mode 100644 index 0000000..f8d621b --- /dev/null +++ b/src/parse_quotes.rs @@ -0,0 +1,49 @@ +use lazy_static::lazy_static; +use regex::Regex; + +use super::ast::DiscordComponent; +use super::parse_basic::parse; + +pub fn parse_quotes(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> { + lazy_static! { + static ref QUOTE: Regex = Regex::new(r"^> (?P.*?)(?P\n|$)").unwrap(); + } + static MULTIQUOTE_PREFIX: &str = ">>> "; + + if text.starts_with(MULTIQUOTE_PREFIX) { + return Some(( + DiscordComponent::Quote(parse(text.strip_prefix(MULTIQUOTE_PREFIX).unwrap())), + text.len(), + )); + } + + if let Some(caps) = QUOTE.captures(text) { + let body = caps.name("body").unwrap().as_str(); + let mut endl = caps.name("endl"); + + let mut whole_len = caps.get(0).unwrap().as_str().len(); + let mut body_components = parse(body); + + while let Some(next_line_caps) = QUOTE.captures(&text[whole_len..]) { + dbg!(&next_line_caps); + + let next_line_body = next_line_caps.name("body").unwrap().as_str(); + endl = next_line_caps.name("endl"); + + let mut next_body_components = parse(next_line_body); + body_components.push(DiscordComponent::LineBreak); + body_components.append(&mut next_body_components); + + whole_len += next_line_caps.get(0).unwrap().as_str().len(); + } + + let endl_len = endl.map(|m| m.as_str().len()).unwrap_or(0); + + return Some(( + DiscordComponent::Quote(body_components), + whole_len - endl_len, + )); + } + + None +}