From df1d3fc05bbe2a1e398e3459a0ba7cfaec689751 Mon Sep 17 00:00:00 2001 From: mykola2312 <49044616+mykola2312@users.noreply.github.com> Date: Sat, 24 Feb 2024 14:32:08 +0200 Subject: [PATCH] write function to find all urls in text --- Cargo.lock | 39 +++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/bot.rs | 1 + src/bot/bot.rs | 1 - src/bot/sanitize.rs | 12 ++++++++++++ src/main.rs | 2 +- 6 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 src/bot/sanitize.rs diff --git a/Cargo.lock b/Cargo.lock index ca15c96..1e11d37 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,6 +30,15 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + [[package]] name = "anyhow" version = "1.0.75" @@ -654,6 +663,7 @@ dependencies = [ "anyhow", "dotenv", "ordered-float", + "regex", "serde", "serde_json", "teloxide", @@ -869,6 +879,35 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "regex" +version = "1.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + [[package]] name = "reqwest" version = "0.11.22" diff --git a/Cargo.toml b/Cargo.toml index 7adb4f3..34f5b2f 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -13,3 +13,4 @@ teloxide = { version = "0.12.2", git ="https://github.com/teloxide/teloxide", fe serde = { version = "1.0.196", features = ["derive"] } serde_json = "1.0.113" ordered-float = "4.2.0" +regex = "1.10.3" diff --git a/src/bot.rs b/src/bot.rs index 1cbfd96..283a1a5 100644 --- a/src/bot.rs +++ b/src/bot.rs @@ -1 +1,2 @@ pub mod bot; +pub mod sanitize; diff --git a/src/bot/bot.rs b/src/bot/bot.rs index de9c895..81ea88a 100644 --- a/src/bot/bot.rs +++ b/src/bot/bot.rs @@ -104,6 +104,5 @@ async fn cmd_download(bot: Bot, msg: Message, url: String) -> HandlerResult { } async fn handle_message(_bot: Bot, _dialogue: MyDialogue, msg: Message) -> HandlerResult { - Ok(()) } diff --git a/src/bot/sanitize.rs b/src/bot/sanitize.rs new file mode 100644 index 0000000..b6ab633 --- /dev/null +++ b/src/bot/sanitize.rs @@ -0,0 +1,12 @@ +use regex::Regex; + +// https://stackoverflow.com/questions/6038061/regular-expression-to-find-urls-within-a-string +const RE_URL: &str = + r"(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])"; + +pub fn extract_urls(text: &str) -> Vec<&str> { + let re = Regex::new(RE_URL).unwrap(); + re.find_iter(text) + .map(|m| m.as_str()) + .collect::<Vec<&str>>() +} diff --git a/src/main.rs b/src/main.rs index 5dd7fdc..563b3eb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,7 +8,7 @@ mod dl; #[tokio::main] async fn main() -> anyhow::Result<()> { dotenv().ok(); - + bot_main().await?; Ok(()) }