implement forum id extraction
This commit is contained in:
parent
9709d929ea
commit
0434503290
3 changed files with 78 additions and 5 deletions
51
Cargo.lock
generated
51
Cargo.lock
generated
|
|
@ -30,6 +30,15 @@ dependencies = [
|
|||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstream"
|
||||
version = "0.6.14"
|
||||
|
|
@ -378,6 +387,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -386,6 +396,12 @@ version = "0.3.30"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"
|
||||
|
||||
[[package]]
|
||||
name = "futures-io"
|
||||
version = "0.3.30"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"
|
||||
|
||||
[[package]]
|
||||
name = "futures-sink"
|
||||
version = "0.3.30"
|
||||
|
|
@ -405,9 +421,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"memchr",
|
||||
"pin-project-lite",
|
||||
"pin-utils",
|
||||
"slab",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -1177,6 +1197,35 @@ dependencies = [
|
|||
"bitflags 2.5.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.10.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.4.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
|
||||
|
||||
[[package]]
|
||||
name = "reqwest"
|
||||
version = "0.12.5"
|
||||
|
|
@ -1186,6 +1235,7 @@ dependencies = [
|
|||
"base64",
|
||||
"bytes",
|
||||
"encoding_rs",
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"h2",
|
||||
|
|
@ -1668,6 +1718,7 @@ version = "0.1.0"
|
|||
dependencies = [
|
||||
"clap",
|
||||
"dotenv",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"rusqlite",
|
||||
"scraper",
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ edition = "2021"
|
|||
[dependencies]
|
||||
clap = { version = "4.5.7", features = ["derive"] }
|
||||
dotenv = "0.15.0"
|
||||
reqwest = "0.12.5"
|
||||
regex = "1.10.5"
|
||||
reqwest = { version = "0.12.5", features = ["blocking"] }
|
||||
rusqlite = { version = "0.31.0", features = ["bundled"] }
|
||||
scraper = "0.19.0"
|
||||
|
|
|
|||
29
src/main.rs
29
src/main.rs
|
|
@ -1,8 +1,10 @@
|
|||
use std::{env, fmt::Debug, fs::{self, DirEntry}, path::Path};
|
||||
use std::{fmt::Debug, fs::{self, DirEntry}, path::Path};
|
||||
use bencode::{decode, Value};
|
||||
use clap::{Parser, Subcommand};
|
||||
use reqwest::StatusCode;
|
||||
use rusqlite::Connection;
|
||||
use dotenv::dotenv;
|
||||
use scraper::{Html, Selector};
|
||||
use regex::Regex;
|
||||
|
||||
mod bencode;
|
||||
|
||||
|
|
@ -226,13 +228,32 @@ fn scrape(db: Connection, destination: &String) {
|
|||
.expect("query_map")
|
||||
.filter_map(|f| f.ok());
|
||||
|
||||
let forum_id_re = Regex::new(".+f=(\\d+)").unwrap();
|
||||
for torrent in torrents {
|
||||
dbg!(torrent.id, torrent.publisher_url);
|
||||
let response = reqwest::blocking::get(torrent.publisher_url).unwrap();
|
||||
if response.status() != StatusCode::OK {
|
||||
eprintln!("torrent {} request error", torrent.id);
|
||||
}
|
||||
|
||||
let document = Html::parse_document(&response.text().unwrap());
|
||||
let selector = Selector::parse("td.nav").unwrap();
|
||||
let selected = document.select(&selector);
|
||||
|
||||
let topics = selected.into_iter().nth(0).unwrap();
|
||||
let topic = topics.children().nth(5).unwrap();
|
||||
let forum_link = topic.value().as_element().unwrap().attr("href").unwrap();
|
||||
|
||||
let forum_id: i64 = str::parse(forum_id_re
|
||||
.captures(forum_link)
|
||||
.unwrap()
|
||||
.get(1)
|
||||
.unwrap()
|
||||
.as_str()
|
||||
).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
dotenv().ok();
|
||||
let args = Args::parse();
|
||||
let db = Connection::open(args.db_path).unwrap();
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue