From 3e48a9955a482a87a8e723aad9b23428a2d88484 Mon Sep 17 00:00:00 2001 From: koshiro <163620373+k0shir0@users.noreply.github.com> Date: Sat, 20 Sep 2025 17:03:57 -0500 Subject: [PATCH] Update scrape.py Added 2 threads for faster scanning. Added a live monitoring option for chat. Data should save every 200 messages in ScrapeBack mode, in case you need to keyboard interrupt with Ctrl+C. --- scrape.py | 249 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 180 insertions(+), 69 deletions(-) diff --git a/scrape.py b/scrape.py index 638a835..3f05544 100644 --- a/scrape.py +++ b/scrape.py @@ -1,8 +1,10 @@ import requests import json import time +import threading from datetime import datetime, timedelta, timezone import re +from queue import Queue # --- User input --- TOKEN = input("Enter your Discord user token: ") @@ -11,107 +13,202 @@ HEADERS = { "Content-Type": "application/json" } -DISCORD_EPOCH = 1420070400000 # Discord epoch in milliseconds +DISCORD_EPOCH = 1420070400000 # Discord epoch in ms +SAVE_INTERVAL = 200 # save every N messages +# --- Utility Functions --- def datetime_to_snowflake(dt): timestamp_ms = int(dt.timestamp() * 1000) - snowflake = (timestamp_ms - DISCORD_EPOCH) << 22 - return snowflake + return str((timestamp_ms - DISCORD_EPOCH) << 22) +def parse_time_frame(time_frame): + now = datetime.now(timezone.utc) + num = int(time_frame[:-1]) + unit = time_frame[-1].lower() + if unit == 'h': + delta = timedelta(hours=num) + elif unit == 'd': + delta = timedelta(days=num) + elif unit == 'm': + delta = timedelta(days=num * 30) + else: + delta = timedelta(days=1) + return now - delta + +def sanitize_filename(name): + return re.sub(r'[^A-Za-z0-9_\-]', '_', name) + +def save_message_to_json(filename, msg): + try: + data = [] + try: + with open(filename, "r", encoding="utf-8") as f: + data = json.load(f) + except FileNotFoundError: + pass + # Prevent duplicates + if msg["id"] not in [m["id"] for m in data]: + data.append({ + "id": msg["id"], + "username": 
msg["author"]["username"], + "message": msg.get("content", ""), + "img": msg["attachments"][0]["url"] if msg.get("attachments") else None, + "time_sent": msg["timestamp"] + }) + with open(filename, "w", encoding="utf-8") as f: + json.dump(data, f, indent=4) + except Exception as e: + print(f"[-] Error saving message: {e}") + +# --- Server & DM Listing --- def list_dms(): - response = requests.get("https://discord.com/api/v10/users/@me/channels", headers=HEADERS) - dms = response.json() + r = requests.get("https://discord.com/api/v10/users/@me/channels", headers=HEADERS) + dms = r.json() for i, dm in enumerate(dms): recipient = dm['recipients'][0]['username'] if dm['type'] == 1 else "Group DM" print(f"{i}: {recipient}") return dms def list_servers(): - response = requests.get("https://discord.com/api/v10/users/@me/guilds", headers=HEADERS) - guilds = response.json() - for i, guild in enumerate(guilds): - print(f"{i}: {guild['name']}") + r = requests.get("https://discord.com/api/v10/users/@me/guilds", headers=HEADERS) + guilds = r.json() + for i, g in enumerate(guilds): + print(f"{i}: {g['name']}") return guilds def list_channels(guild_id): - response = requests.get(f"https://discord.com/api/v10/guilds/{guild_id}/channels", headers=HEADERS) - channels = response.json() - for i, channel in enumerate(channels): - print(f"{i}: {channel['name']} ({channel['id']})") - return channels + r = requests.get(f"https://discord.com/api/v10/guilds/{guild_id}/channels", headers=HEADERS) + chans = r.json() + for i, c in enumerate(chans): + print(f"{i}: {c['name']} ({c['id']})") + return chans -def parse_time_frame(time_frame): - now = datetime.now(timezone.utc) - num = int(time_frame[:-1]) - unit = time_frame[-1].lower() - - if unit == 'h': - delta = timedelta(hours=num) - elif unit == 'd': - delta = timedelta(days=num) - elif unit == 'm': - delta = timedelta(days=num*30) - else: - print("Invalid time format. 
Using 1 day as default.") - delta = timedelta(days=1) - - return now - delta - -def sanitize_filename(name): - """Remove emojis and keep only letters, numbers, underscores, and dashes.""" - return re.sub(r'[^A-Za-z0-9_\-]', '_', name) - -def scrape_messages(channel_id, time_frame, custom_name): +# --- Historical Scrape --- +def fetch_messages(channel_id, direction, start_id, out_queue): url = f"https://discord.com/api/v10/channels/{channel_id}/messages" - after_datetime = parse_time_frame(time_frame) - after_snowflake = str(datetime_to_snowflake(after_datetime)) - messages_data = [] - seen_ids = set() # Track message IDs to avoid duplicates - - params = {"after": after_snowflake, "limit": 100} + params = {"limit": 100} + params[direction] = start_id while True: response = requests.get(url, headers=HEADERS, params=params) if response.status_code == 429: retry_after = response.json().get("retry_after", 1) - print(f"Rate limited. Sleeping for {retry_after} seconds...") + print(f"Rate limited ({direction}). 
Sleeping {retry_after}s...") time.sleep(retry_after) continue if response.status_code != 200: - print(f"Error: {response.status_code} - {response.text}") + print(f"Error {direction}: {response.status_code} - {response.text}") break messages = response.json() if not messages: break + # always ordered newest -> oldest, so adjust pagination + if direction == "after": + params["after"] = messages[-1]["id"] + else: + params["before"] = messages[-1]["id"] + for msg in messages: - if msg['id'] in seen_ids: # Skip duplicates - continue - seen_ids.add(msg['id']) - entry = { - "username": msg['author']['username'], - "message": msg.get('content', ''), - "img": msg['attachments'][0]['url'] if msg.get('attachments') else None, - "time_sent": msg['timestamp'] - } - messages_data.append(entry) + out_queue.put({ + "id": msg["id"], + "username": msg["author"]["username"], + "message": msg.get("content", ""), + "img": msg["attachments"][0]["url"] if msg.get("attachments") else None, + "time_sent": msg["timestamp"] + }) - params['after'] = messages[-1]['id'] - time.sleep(0.1) +def scrape_messages(channel_id, time_frame, custom_name): + after_datetime = parse_time_frame(time_frame) + after_snowflake = datetime_to_snowflake(after_datetime) - # Sort messages chronologically - messages_data.sort(key=lambda x: x['time_sent']) + out_queue = Queue() + threads = [] + # thread 1: forward from cutoff + t1 = threading.Thread(target=fetch_messages, args=(channel_id, "after", after_snowflake, out_queue)) + threads.append(t1) + + # thread 2: backward from now + t2 = threading.Thread(target=fetch_messages, args=(channel_id, "before", "999999999999999999", out_queue)) + threads.append(t2) + + for t in threads: + t.start() + + messages_data = {} + saved_count = 0 + + while any(t.is_alive() for t in threads) or not out_queue.empty(): + while not out_queue.empty(): + msg = out_queue.get() + if msg["id"] not in messages_data: # dedup by ID + messages_data[msg["id"]] = msg + + if len(messages_data) - 
saved_count >= SAVE_INTERVAL: + filename = sanitize_filename(custom_name) + ".json" + with open(filename, "w", encoding="utf-8") as f: + json.dump(sorted(messages_data.values(), key=lambda x: x["time_sent"]), f, indent=4) + saved_count = len(messages_data) + print(f"Progress: {saved_count} messages saved...") + + time.sleep(0.2) + + for t in threads: + t.join() + + # final save filename = sanitize_filename(custom_name) + ".json" - with open(filename, 'w', encoding='utf-8') as f: - json.dump(messages_data, f, indent=4) + with open(filename, "w", encoding="utf-8") as f: + json.dump(sorted(messages_data.values(), key=lambda x: x["time_sent"]), f, indent=4) - print(f"Scraped {len(messages_data)} messages. Saved to {filename}") + print(f"Finished: {len(messages_data)} messages scraped. Saved to {filename}") -# --- Main program --- +# --- Live Monitor --- +def live_monitor(channel_id): + filename = f"channel_{channel_id}_live.json" + last_message_id = None + print("\n[+] Live monitoring started. Press CTRL+C to stop.\n") + try: + while True: + url = f"https://discord.com/api/v10/channels/{channel_id}/messages?limit=50" + if last_message_id: + url += f"&after={last_message_id}" + + try: + r = requests.get(url, headers=HEADERS) + if r.status_code != 200: + print(f"[-] Error fetching messages: {r.status_code} {r.text}") + time.sleep(2) + continue + messages = r.json() + except Exception as e: + print(f"[-] Exception: {e}") + time.sleep(2) + continue + + messages = sorted(messages, key=lambda x: int(x["id"])) # oldest first + + for msg in messages: + author = msg["author"]["username"] + content = msg.get("content", "") + attachments = [a["url"] for a in msg.get("attachments", [])] + print(f"{author}: {content}") + if attachments: + for a in attachments: + print(f"[Attachment] {a}") + save_message_to_json(filename, msg) + last_message_id = msg["id"] # update to latest message + + time.sleep(2) + + except KeyboardInterrupt: + print("\n[!] Live monitoring stopped. 
Data saved.") + +# --- Main Program --- print("Choose an option:") print("1: DMs") print("2: Servers") @@ -119,19 +216,33 @@ choice = input("> ") if choice == "1": dms = list_dms() - dm_index = int(input("Choose a DM to scrape: ")) + dm_index = int(input("Choose a DM: ")) channel_id = dms[dm_index]['id'] - custom_name = input("Enter custom filename (without extension): ") + custom_name = input("Custom filename (no extension): ") + time_frame = input("Enter time frame (e.g., 1-99h, 1-99d, 1-99m): ") + scrape_messages(channel_id, time_frame, custom_name) + elif choice == "2": guilds = list_servers() guild_index = int(input("Choose a server: ")) channels = list_channels(guilds[guild_index]['id']) - channel_index = int(input("Choose a channel to scrape: ")) + channel_index = int(input("Choose a channel: ")) channel_id = channels[channel_index]['id'] - custom_name = f"{guilds[guild_index]['name']}_{channels[channel_index]['name']}" + + print("Choose mode:") + print("1: Live-Monitor") + print("2: ScrapeBack") + mode = input("> ") + + if mode == "1": + live_monitor(channel_id) + elif mode == "2": + custom_name = f"{guilds[guild_index]['name']}_{channels[channel_index]['name']}" + time_frame = input("Enter time frame (e.g., 1-99h, 1-99d, 1-99m): ") + scrape_messages(channel_id, time_frame, custom_name) + else: + print("Invalid choice") + exit() else: print("Invalid choice") exit() - -time_frame = input("Enter time frame (e.g., 1-99h, 1-99d, 1-99m): ") -scrape_messages(channel_id, time_frame, custom_name)