diff --git a/Cargo.lock b/Cargo.lock index 62dd0cf..42ad4d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,6 +30,15 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + [[package]] name = "anstream" version = "0.6.14" @@ -378,6 +387,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" dependencies = [ "futures-core", + "futures-sink", ] [[package]] @@ -386,6 +396,12 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +[[package]] +name = "futures-io" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" + [[package]] name = "futures-sink" version = "0.3.30" @@ -405,9 +421,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ "futures-core", + "futures-io", + "futures-sink", "futures-task", + "memchr", "pin-project-lite", "pin-utils", + "slab", ] [[package]] @@ -1177,6 +1197,35 @@ dependencies = [ "bitflags 2.5.0", ] +[[package]] +name = "regex" +version = "1.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" + [[package]] name = "reqwest" version = "0.12.5" @@ -1186,6 +1235,7 @@ dependencies = [ "base64", "bytes", "encoding_rs", + "futures-channel", "futures-core", "futures-util", "h2", @@ -1668,6 +1718,7 @@ version = "0.1.0" dependencies = [ "clap", "dotenv", + "regex", "reqwest", "rusqlite", "scraper", diff --git a/Cargo.toml b/Cargo.toml index 76e835d..53469a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ edition = "2021" [dependencies] clap = { version = "4.5.7", features = ["derive"] } dotenv = "0.15.0" -reqwest = "0.12.5" +regex = "1.10.5" +reqwest = { version = "0.12.5", features = ["blocking"] } rusqlite = { version = "0.31.0", features = ["bundled"] } scraper = "0.19.0" diff --git a/src/main.rs b/src/main.rs index 6364873..11d3e8c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,8 +1,10 @@ -use std::{env, fmt::Debug, fs::{self, DirEntry}, path::Path}; +use std::{fmt::Debug, fs::{self, DirEntry}, path::Path}; use bencode::{decode, Value}; use clap::{Parser, Subcommand}; +use reqwest::StatusCode; use rusqlite::Connection; -use dotenv::dotenv; +use scraper::{Html, Selector}; +use regex::Regex; mod bencode; @@ -226,13 +228,32 @@ fn scrape(db: Connection, destination: &String) { .expect("query_map") .filter_map(|f| f.ok()); + let forum_id_re = Regex::new(".+f=(\\d+)").unwrap(); for torrent in torrents { - dbg!(torrent.id, torrent.publisher_url); + let response = reqwest::blocking::get(torrent.publisher_url).unwrap(); + if response.status() != StatusCode::OK { + eprintln!("torrent {} request error", torrent.id); + } + + let document = Html::parse_document(&response.text().unwrap()); + let selector = Selector::parse("td.nav").unwrap(); + let selected = document.select(&selector); + + let topics = selected.into_iter().nth(0).unwrap(); + let topic = topics.children().nth(5).unwrap(); + let forum_link = topic.value().as_element().unwrap().attr("href").unwrap(); + + let forum_id: i64 = str::parse(forum_id_re + .captures(forum_link) + .unwrap() + .get(1) + .unwrap() + .as_str() + ).unwrap(); } } fn main() { - dotenv().ok(); let args = Args::parse(); let db = Connection::open(args.db_path).unwrap();