Implement forum ID extraction from torrent publisher pages

This commit is contained in:
mykola2312 2024-06-18 22:47:39 +03:00
parent 9709d929ea
commit 0434503290
3 changed files with 78 additions and 5 deletions

51
Cargo.lock generated
View file

@ -30,6 +30,15 @@ dependencies = [
"zerocopy",
]
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "anstream"
version = "0.6.14"
@ -378,6 +387,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78"
dependencies = [
"futures-core",
"futures-sink",
]
[[package]]
@ -386,6 +396,12 @@ version = "0.3.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"
[[package]]
name = "futures-io"
version = "0.3.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"
[[package]]
name = "futures-sink"
version = "0.3.30"
@ -405,9 +421,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48"
dependencies = [
"futures-core",
"futures-io",
"futures-sink",
"futures-task",
"memchr",
"pin-project-lite",
"pin-utils",
"slab",
]
[[package]]
@ -1177,6 +1197,35 @@ dependencies = [
"bitflags 2.5.0",
]
[[package]]
name = "regex"
version = "1.10.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
[[package]]
name = "reqwest"
version = "0.12.5"
@ -1186,6 +1235,7 @@ dependencies = [
"base64",
"bytes",
"encoding_rs",
"futures-channel",
"futures-core",
"futures-util",
"h2",
@ -1668,6 +1718,7 @@ version = "0.1.0"
dependencies = [
"clap",
"dotenv",
"regex",
"reqwest",
"rusqlite",
"scraper",

View file

@ -8,6 +8,7 @@ edition = "2021"
[dependencies]
clap = { version = "4.5.7", features = ["derive"] }
dotenv = "0.15.0"
reqwest = "0.12.5"
regex = "1.10.5"
reqwest = { version = "0.12.5", features = ["blocking"] }
rusqlite = { version = "0.31.0", features = ["bundled"] }
scraper = "0.19.0"

View file

@ -1,8 +1,10 @@
use std::{env, fmt::Debug, fs::{self, DirEntry}, path::Path};
use std::{fmt::Debug, fs::{self, DirEntry}, path::Path};
use bencode::{decode, Value};
use clap::{Parser, Subcommand};
use reqwest::StatusCode;
use rusqlite::Connection;
use dotenv::dotenv;
use scraper::{Html, Selector};
use regex::Regex;
mod bencode;
@ -226,13 +228,32 @@ fn scrape(db: Connection, destination: &String) {
.expect("query_map")
.filter_map(|f| f.ok());
let forum_id_re = Regex::new(".+f=(\\d+)").unwrap();
for torrent in torrents {
dbg!(torrent.id, torrent.publisher_url);
let response = reqwest::blocking::get(torrent.publisher_url).unwrap();
if response.status() != StatusCode::OK {
eprintln!("torrent {} request error", torrent.id);
}
let document = Html::parse_document(&response.text().unwrap());
let selector = Selector::parse("td.nav").unwrap();
let selected = document.select(&selector);
let topics = selected.into_iter().nth(0).unwrap();
let topic = topics.children().nth(5).unwrap();
let forum_link = topic.value().as_element().unwrap().attr("href").unwrap();
let forum_id: i64 = str::parse(forum_id_re
.captures(forum_link)
.unwrap()
.get(1)
.unwrap()
.as_str()
).unwrap();
}
}
fn main() {
dotenv().ok();
let args = Args::parse();
let db = Connection::open(args.db_path).unwrap();