Initial commit

2021-08-07 10:34:53 +01:00 · 2021-08-07 10:34:53 +01:00 · bbf120f753
commit bbf120f753
12 changed files with 448 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+/target
--- a/Cargo.lock
+++ b/Cargo.lock
@ -0,0 +1,49 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "aho-corasick"
+version = "0.7.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "discord_message_format"
+version = "0.1.0"
+dependencies = [
+ "lazy_static",
+ "regex",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
+[[package]]
+name = "memchr"
+version = "2.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"
+
+[[package]]
+name = "regex"
+version = "1.5.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.6.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -0,0 +1,8 @@
+[package]
+name = "discord_message_format"
+version = "0.1.0"
+edition = "2018"
+
+[dependencies]
+lazy_static = "1.4.0"
+regex = "1.5.4"
--- a/README.md
+++ b/README.md
@ -0,0 +1,10 @@
+# discord_message_format
+
+AST construction for the message formatting syntax that Discord uses.
+
+This lets your bot stay formatting-aware when it parses user messages.
+
+## To Do
+
+- User/role mentions
+- Emoji
--- a/src/ast.rs
+++ b/src/ast.rs
@ -0,0 +1,22 @@
+/// A single node of the tree produced by parsing a Discord-formatted message.
+///
+/// All string payloads borrow from the text that was parsed (lifetime `'a`),
+/// so building the tree does not copy message content.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum DiscordComponent<'a> {
+    /// A run of text that matched no formatting rule.
+    Plain(&'a str),
+    /// A single character that was written with a preceding backslash escape.
+    Literal(char),
+    /// A bare URL; the payload is the URL text itself.
+    Link(&'a str),
+
+    /// `**bold**` content, parsed recursively into child components.
+    Bold(Vec<DiscordComponent<'a>>),
+    /// `*italic*` or `_italic_` content, parsed recursively.
+    Italic(Vec<DiscordComponent<'a>>),
+    /// `~~strikethrough~~` content, parsed recursively.
+    Strikethrough(Vec<DiscordComponent<'a>>),
+    /// `__underline__` content, parsed recursively.
+    Underline(Vec<DiscordComponent<'a>>),
+
+    /// Inline code between single backticks; the content is kept verbatim.
+    Code(&'a str),
+    /// A fenced code block delimited by triple backticks.
+    CodeBlock {
+        /// The text following the opening fence on the same line, if any.
+        lang: Option<&'a str>,
+        /// The verbatim content between the fences.
+        source: &'a str,
+    },
+
+    /// `||spoiler||` content, parsed recursively.
+    Spoiler(Vec<DiscordComponent<'a>>),
+
+    /// A newline in the source text.
+    LineBreak,
+    /// A block quote (`> ` lines or a `>>> ` prefix), parsed recursively.
+    Quote(Vec<DiscordComponent<'a>>),
+}
--- a/src/convert.rs
+++ b/src/convert.rs
@ -0,0 +1,52 @@
+use super::DiscordComponent;
+
+/// Conversion of parsed Discord message components into an HTML string.
+pub trait ToHtml {
+    /// Renders `self` as HTML and returns the markup as an owned `String`.
+    fn to_html(&self) -> String;
+}
+
+/// Escapes the characters that are significant in HTML text content and in
+/// quoted attribute values (`&`, `<`, `>`, `"`, `'`).
+fn escape_html(raw: &str) -> String {
+    let mut escaped = String::with_capacity(raw.len());
+    for c in raw.chars() {
+        match c {
+            '&' => escaped.push_str("&amp;"),
+            '<' => escaped.push_str("&lt;"),
+            '>' => escaped.push_str("&gt;"),
+            '"' => escaped.push_str("&quot;"),
+            '\'' => escaped.push_str("&#39;"),
+            other => escaped.push(other),
+        }
+    }
+    escaped
+}
+
+impl<'a> ToHtml for DiscordComponent<'a> {
+    /// Renders this component (and, recursively, its children) as HTML.
+    ///
+    /// Every piece of message text is HTML-escaped before being placed in the
+    /// output, so message content cannot inject markup or break out of an
+    /// attribute value.
+    fn to_html(&self) -> String {
+        match self {
+            DiscordComponent::Plain(s) => escape_html(s),
+            DiscordComponent::Literal(c) => {
+                let mut buf = [0u8; 4];
+                escape_html(c.encode_utf8(&mut buf))
+            }
+            // The same escaped text is used for both the `href` value and the
+            // visible link text.
+            DiscordComponent::Link(target) => {
+                format!(r#"<a href="{0}">{0}</a>"#, escape_html(target))
+            }
+
+            DiscordComponent::Bold(children) => {
+                format!("<strong>{}</strong>", children.to_html())
+            }
+            DiscordComponent::Italic(children) => {
+                format!("<em>{}</em>", children.to_html())
+            }
+            DiscordComponent::Strikethrough(children) => {
+                format!("<del>{}</del>", children.to_html())
+            }
+            DiscordComponent::Underline(children) => {
+                format!("<u>{}</u>", children.to_html())
+            }
+
+            DiscordComponent::Code(source) => {
+                format!("<code>{}</code>", escape_html(source))
+            }
+            DiscordComponent::CodeBlock { lang, source } => {
+                // Only emit a `class` attribute when a language was given.
+                let language_class = lang
+                    .map(|l| format!(" class=\"language-{}\"", escape_html(l)))
+                    .unwrap_or_default();
+
+                format!(
+                    "<pre><code{}>{}</code></pre>",
+                    language_class,
+                    escape_html(source)
+                )
+            }
+
+            DiscordComponent::Spoiler(children) => {
+                format!("<span data-mx-spoiler>{}</span>", children.to_html())
+            }
+
+            DiscordComponent::LineBreak => "<br>".to_string(),
+            DiscordComponent::Quote(children) => {
+                format!("<blockquote>{}</blockquote>", children.to_html())
+            }
+        }
+    }
+}
+
+impl<'a> ToHtml for Vec<DiscordComponent<'a>> {
+    /// Renders every component in order and concatenates the results with no
+    /// separator.
+    fn to_html(&self) -> String {
+        let mut html = String::new();
+        for component in self {
+            html.push_str(&component.to_html());
+        }
+        html
+    }
+}
--- a/src/lib.rs
+++ b/src/lib.rs
@ -0,0 +1,13 @@
+mod ast;
+
+mod convert;
+
+mod parse_basic;
+mod parse_code;
+mod parse_inline_style;
+mod parse_link;
+mod parse_quotes;
+
+pub use ast::DiscordComponent;
+pub use convert::ToHtml;
+pub use parse_basic::parse;
--- a/src/parse_basic.rs
+++ b/src/parse_basic.rs
@ -0,0 +1,87 @@
+use super::{
+    ast::DiscordComponent, parse_code::*, parse_inline_style::*, parse_link::*, parse_quotes::*,
+};
+
+/// Parses `text` into a flat list of top-level components.
+///
+/// The input is scanned left to right. At each position `parse_token` is
+/// tried; when it matches, the token is emitted and the scan skips the bytes
+/// it consumed. Positions where nothing matches are accumulated and emitted as
+/// a single [`DiscordComponent::Plain`] run just before the next token, or at
+/// the end of the input.
+pub fn parse(text: &'_ str) -> Vec<DiscordComponent<'_>> {
+    let mut tokens = Vec::new();
+    // Byte offset at which the current run of unmatched text began, if any.
+    let mut plain_start: Option<usize> = None;
+    let mut i = 0;
+    while i < text.len() {
+        // A position is only at the start of a line when nothing precedes it
+        // on that line: there is no buffered plain text, and the previously
+        // emitted token (if any) is a line break. Without the plain-text
+        // check, "hello > world" would be treated as containing a quote.
+        let is_line_start = plain_start.is_none()
+            && matches!(tokens.last(), None | Some(DiscordComponent::LineBreak));
+
+        match parse_token(&text[i..], is_line_start) {
+            Some((token, consumed)) => {
+                // Flush any buffered plain text that precedes this token.
+                if let Some(start) = plain_start.take() {
+                    tokens.push(DiscordComponent::Plain(&text[start..i]));
+                }
+
+                tokens.push(token);
+                i += consumed;
+            }
+
+            None => {
+                plain_start.get_or_insert(i);
+
+                // Step over one whole character so `i` stays on a UTF-8
+                // character boundary.
+                i += text[i..].chars().next().map_or(1, char::len_utf8);
+            }
+        }
+    }
+
+    if let Some(start) = plain_start {
+        tokens.push(DiscordComponent::Plain(&text[start..]));
+    }
+
+    tokens
+}
+
+/// Tries each token parser against the start of `text`, in priority order.
+///
+/// Returns the first match together with the number of bytes it consumed, or
+/// `None` when no parser recognises the start of `text`. Quotes are only
+/// attempted when `is_line_start` is true.
+///
+/// The order matters where one syntax is a prefix of another: code blocks
+/// come before inline code, bold before italic, and underline before italic.
+pub fn parse_token(text: &'_ str, is_line_start: bool) -> Option<(DiscordComponent<'_>, usize)> {
+    parse_escaped_literal(text)
+        .or_else(|| {
+            if is_line_start {
+                parse_quotes(text)
+            } else {
+                None
+            }
+        })
+        .or_else(|| parse_code_block(text))
+        .or_else(|| parse_code(text))
+        .or_else(|| parse_link(text))
+        .or_else(|| parse_bold(text))
+        // Underline must be tried before italic: `__x__` also satisfies the
+        // single-underscore italic pattern (as `__x_`), which would otherwise
+        // win and make underline unreachable.
+        .or_else(|| parse_underline(text))
+        .or_else(|| parse_italic(text))
+        .or_else(|| parse_strikethrough(text))
+        .or_else(|| parse_spoiler(text))
+        .or_else(|| parse_line_break(text))
+}
+
+/// Recognises a backslash escape at the start of `text`.
+///
+/// When `text` begins with `\` followed by one of the escapable punctuation
+/// characters, returns that character as a [`DiscordComponent::Literal`] and
+/// a consumed length of 2 bytes. Otherwise returns `None`.
+pub fn parse_escaped_literal(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> {
+    // Characters that a preceding backslash turns into a literal.
+    const ESCAPABLE: &[char] = &[
+        '\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!',
+    ];
+
+    let escaped = text.strip_prefix('\\')?.chars().next()?;
+
+    if ESCAPABLE.contains(&escaped) {
+        Some((DiscordComponent::Literal(escaped), 2))
+    } else {
+        None
+    }
+}
+
+/// Recognises a newline at the start of `text`.
+///
+/// Returns [`DiscordComponent::LineBreak`] with a consumed length of 1 byte
+/// when `text` begins with `\n`, and `None` otherwise.
+pub fn parse_line_break(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> {
+    match text.as_bytes().first() {
+        Some(b'\n') => Some((DiscordComponent::LineBreak, 1)),
+        _ => None,
+    }
+}
--- a/src/parse_code.rs
+++ b/src/parse_code.rs
@ -0,0 +1,36 @@
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use super::ast::DiscordComponent;
+
+/// Recognises a fenced code block at the start of `text`.
+///
+/// The expected shape is an opening triple-backtick fence, optionally followed
+/// on the same line by a language tag, then a newline, the block's source, a
+/// newline, and a closing triple-backtick fence. The source may span any
+/// number of lines and ends at the first newline followed by a closing fence.
+///
+/// Returns the [`DiscordComponent::CodeBlock`] and the number of bytes
+/// consumed (both fences included), or `None` when `text` does not start with
+/// such a block.
+pub fn parse_code_block(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> {
+    lazy_static! {
+        // `.` does not match `\n` by default, so the `s` flag is enabled for
+        // the source group only; the language tag stays confined to the
+        // fence line.
+        static ref CODE_BLOCK: Regex =
+            Regex::new(r"^```(?P<lang>.+)?\n(?P<source>(?s:.*?))\n```").unwrap();
+    }
+
+    let caps = CODE_BLOCK.captures(text)?;
+    let lang = caps.name("lang").map(|m| m.as_str());
+    let source = caps.name("source")?.as_str();
+    let whole_len = caps.get(0)?.as_str().len();
+
+    Some((DiscordComponent::CodeBlock { lang, source }, whole_len))
+}
+
+/// Recognises inline code (text between single backticks) at the start of
+/// `text`.
+///
+/// Returns the [`DiscordComponent::Code`] holding the text between the
+/// backticks, and the number of bytes consumed including both backticks.
+pub fn parse_code(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> {
+    lazy_static! {
+        static ref CODE: Regex = Regex::new(r"^`(?P<inner>.+?)`").unwrap();
+    }
+
+    CODE.captures(text).map(|caps| {
+        let span = caps.name("inner").unwrap().as_str();
+        (DiscordComponent::Code(span), caps[0].len())
+    })
+}
--- a/src/parse_inline_style.rs
+++ b/src/parse_inline_style.rs
@ -0,0 +1,84 @@
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use super::ast::DiscordComponent;
+use super::parse_basic::parse;
+
+/// Recognises `**bold**` text at the start of `text`.
+///
+/// The text between the markers is parsed recursively. Returns the
+/// [`DiscordComponent::Bold`] node and the number of bytes consumed, markers
+/// included.
+pub fn parse_bold(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> {
+    lazy_static! {
+        static ref BOLD: Regex = Regex::new(r"^\*\*(?P<inner>.+?)\*\*").unwrap();
+    }
+
+    let caps = BOLD.captures(text)?;
+    let consumed = caps[0].len();
+    let children = parse(caps.name("inner").unwrap().as_str());
+
+    Some((DiscordComponent::Bold(children), consumed))
+}
+
+/// Recognises `_italic_` or `*italic*` text at the start of `text`.
+///
+/// The underscore form is tried first, then the asterisk form. The text
+/// between the markers is parsed recursively. Returns the
+/// [`DiscordComponent::Italic`] node and the number of bytes consumed,
+/// markers included.
+pub fn parse_italic(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> {
+    lazy_static! {
+        static ref ITALIC_UNDERSCORE: Regex = Regex::new(r"^_(?P<inner>.+?)_").unwrap();
+        static ref ITALIC_ASTERISK: Regex = Regex::new(r"^\*(?P<inner>.+?)\*").unwrap();
+    }
+
+    let caps = match ITALIC_UNDERSCORE.captures(text) {
+        Some(found) => found,
+        None => ITALIC_ASTERISK.captures(text)?,
+    };
+
+    let consumed = caps[0].len();
+    let children = parse(caps.name("inner").unwrap().as_str());
+
+    Some((DiscordComponent::Italic(children), consumed))
+}
+
+/// Recognises `~~strikethrough~~` text at the start of `text`.
+///
+/// The text between the markers is parsed recursively. Returns the
+/// [`DiscordComponent::Strikethrough`] node and the number of bytes consumed,
+/// markers included.
+pub fn parse_strikethrough(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> {
+    lazy_static! {
+        static ref STRIKETHROUGH: Regex = Regex::new(r"^~~(?P<inner>.+?)~~").unwrap();
+    }
+
+    STRIKETHROUGH.captures(text).map(|caps| {
+        let children = parse(caps.name("inner").unwrap().as_str());
+        (DiscordComponent::Strikethrough(children), caps[0].len())
+    })
+}
+
+/// Recognises `__underline__` text at the start of `text`.
+///
+/// The text between the markers is parsed recursively. Returns the
+/// [`DiscordComponent::Underline`] node and the number of bytes consumed,
+/// markers included.
+pub fn parse_underline(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> {
+    lazy_static! {
+        static ref UNDERLINE: Regex = Regex::new(r"^__(?P<inner>.+?)__").unwrap();
+    }
+
+    let caps = UNDERLINE.captures(text)?;
+    let consumed = caps[0].len();
+    let children = parse(caps.name("inner").unwrap().as_str());
+
+    Some((DiscordComponent::Underline(children), consumed))
+}
+
+/// Recognises `||spoiler||` text at the start of `text`.
+///
+/// The text between the markers is parsed recursively. Returns the
+/// [`DiscordComponent::Spoiler`] node and the number of bytes consumed,
+/// markers included.
+pub fn parse_spoiler(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> {
+    lazy_static! {
+        static ref SPOILER: Regex = Regex::new(r"^\|\|(?P<inner>.+?)\|\|").unwrap();
+    }
+
+    SPOILER.captures(text).map(|caps| {
+        let children = parse(caps.name("inner").unwrap().as_str());
+        (DiscordComponent::Spoiler(children), caps[0].len())
+    })
+}
--- a/src/parse_link.rs
+++ b/src/parse_link.rs
@ -0,0 +1,37 @@
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use super::ast::DiscordComponent;
+
+/// Recognises a bare `http://`, `https://` or `steam://` URL at the start of
+/// `text`.
+///
+/// The URL runs up to the next whitespace or `<`, and may not end in common
+/// trailing punctuation (`.`, `,`, `:`, `;`, quotes, `]`). If the match ends
+/// in `)` and contains more closing than opening parentheses, that final `)`
+/// is left out of the link, so a URL wrapped in parentheses does not swallow
+/// the closing one.
+///
+/// Returns the [`DiscordComponent::Link`] and the byte length of the link
+/// text, or `None` when `text` does not start with a URL.
+pub fn parse_link(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> {
+    lazy_static! {
+        static ref LINK: Regex =
+            Regex::new(r#"^((?:https?|steam)://[^\s<]+[^<.,:;"'\]\s])"#).unwrap();
+    }
+
+    // The pattern is anchored at the start, so the match always begins at 0.
+    let mut link = LINK.find(text)?.as_str();
+
+    if link.ends_with(')') {
+        // Counted as `usize` so that very long inputs cannot overflow the
+        // counters.
+        let opening = link.matches('(').count();
+        let closing = link.matches(')').count();
+
+        if closing > opening {
+            // `)` is a single byte, so trimming one byte is a valid slice.
+            link = &link[..link.len() - 1];
+        }
+    }
+
+    Some((DiscordComponent::Link(link), link.len()))
+}
--- a/src/parse_quotes.rs
+++ b/src/parse_quotes.rs
@ -0,0 +1,49 @@
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use super::ast::DiscordComponent;
+use super::parse_basic::parse;
+
+/// Recognises a block quote at the start of `text`.
+///
+/// Two forms are handled:
+///
+/// * `>>> ` — everything after the prefix, to the end of `text`, is the quote
+///   body; the whole of `text` is consumed.
+/// * `> ` — the rest of the line is the quote body. Directly following lines
+///   that also start with `> ` are merged into the same quote, separated by
+///   [`DiscordComponent::LineBreak`].
+///
+/// The body is parsed recursively. For the `> ` form the newline that ends
+/// the last quoted line is not counted as consumed, so the caller sees it as
+/// a regular line break. Returns `None` when `text` does not start with a
+/// quote marker.
+pub fn parse_quotes(text: &'_ str) -> Option<(DiscordComponent<'_>, usize)> {
+    lazy_static! {
+        static ref QUOTE: Regex = Regex::new(r"^> (?P<body>.*?)(?P<endl>\n|$)").unwrap();
+    }
+    const MULTIQUOTE_PREFIX: &str = ">>> ";
+
+    if let Some(rest) = text.strip_prefix(MULTIQUOTE_PREFIX) {
+        return Some((DiscordComponent::Quote(parse(rest)), text.len()));
+    }
+
+    let first_line = QUOTE.captures(text)?;
+    let mut body_components = parse(first_line.name("body").unwrap().as_str());
+    let mut whole_len = first_line[0].len();
+    // Length of the line terminator of the most recently matched line: 1 for
+    // `\n`, 0 when the line ended at the end of the input.
+    let mut endl_len = first_line.name("endl").map_or(0, |m| m.as_str().len());
+
+    // Merge directly following `> ` lines into the same quote.
+    while let Some(next_line) = QUOTE.captures(&text[whole_len..]) {
+        body_components.push(DiscordComponent::LineBreak);
+        body_components.extend(parse(next_line.name("body").unwrap().as_str()));
+
+        endl_len = next_line.name("endl").map_or(0, |m| m.as_str().len());
+        whole_len += next_line[0].len();
+    }
+
+    // Leave the final newline unconsumed.
+    Some((
+        DiscordComponent::Quote(body_components),
+        whole_len - endl_len,
+    ))
+}