Update scrape.py

Added 2 threads for faster scanning 
live monitoring option for chat
Data should save every 100 messages in scrapeback mode, in case you need to keyboard-interrupt with Ctrl+C.
This commit is contained in:
koshiro 2025-09-20 17:03:57 -05:00 committed by GitHub
parent ac9e02010b
commit 3e48a9955a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

235
scrape.py
View file

@ -1,8 +1,10 @@
import requests import requests
import json import json
import time import time
import threading
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
import re import re
from queue import Queue
# --- User input --- # --- User input ---
TOKEN = input("Enter your Discord user token: ") TOKEN = input("Enter your Discord user token: ")
@ -11,40 +13,18 @@ HEADERS = {
"Content-Type": "application/json" "Content-Type": "application/json"
} }
DISCORD_EPOCH = 1420070400000 # Discord epoch in milliseconds DISCORD_EPOCH = 1420070400000 # Discord epoch in ms
SAVE_INTERVAL = 200 # save every N messages
# --- Utility Functions ---
def datetime_to_snowflake(dt): def datetime_to_snowflake(dt):
timestamp_ms = int(dt.timestamp() * 1000) timestamp_ms = int(dt.timestamp() * 1000)
snowflake = (timestamp_ms - DISCORD_EPOCH) << 22 return str((timestamp_ms - DISCORD_EPOCH) << 22)
return snowflake
def list_dms():
response = requests.get("https://discord.com/api/v10/users/@me/channels", headers=HEADERS)
dms = response.json()
for i, dm in enumerate(dms):
recipient = dm['recipients'][0]['username'] if dm['type'] == 1 else "Group DM"
print(f"{i}: {recipient}")
return dms
def list_servers():
response = requests.get("https://discord.com/api/v10/users/@me/guilds", headers=HEADERS)
guilds = response.json()
for i, guild in enumerate(guilds):
print(f"{i}: {guild['name']}")
return guilds
def list_channels(guild_id):
response = requests.get(f"https://discord.com/api/v10/guilds/{guild_id}/channels", headers=HEADERS)
channels = response.json()
for i, channel in enumerate(channels):
print(f"{i}: {channel['name']} ({channel['id']})")
return channels
def parse_time_frame(time_frame): def parse_time_frame(time_frame):
now = datetime.now(timezone.utc) now = datetime.now(timezone.utc)
num = int(time_frame[:-1]) num = int(time_frame[:-1])
unit = time_frame[-1].lower() unit = time_frame[-1].lower()
if unit == 'h': if unit == 'h':
delta = timedelta(hours=num) delta = timedelta(hours=num)
elif unit == 'd': elif unit == 'd':
@ -52,66 +32,183 @@ def parse_time_frame(time_frame):
elif unit == 'm': elif unit == 'm':
delta = timedelta(days=num * 30) delta = timedelta(days=num * 30)
else: else:
print("Invalid time format. Using 1 day as default.")
delta = timedelta(days=1) delta = timedelta(days=1)
return now - delta return now - delta
def sanitize_filename(name): def sanitize_filename(name):
"""Remove emojis and keep only letters, numbers, underscores, and dashes."""
return re.sub(r'[^A-Za-z0-9_\-]', '_', name) return re.sub(r'[^A-Za-z0-9_\-]', '_', name)
def scrape_messages(channel_id, time_frame, custom_name): def save_message_to_json(filename, msg):
url = f"https://discord.com/api/v10/channels/{channel_id}/messages" try:
after_datetime = parse_time_frame(time_frame) data = []
after_snowflake = str(datetime_to_snowflake(after_datetime)) try:
messages_data = [] with open(filename, "r", encoding="utf-8") as f:
seen_ids = set() # Track message IDs to avoid duplicates data = json.load(f)
except FileNotFoundError:
pass
# Prevent duplicates
if msg["id"] not in [m["id"] for m in data]:
data.append({
"id": msg["id"],
"username": msg["author"]["username"],
"message": msg.get("content", ""),
"img": msg["attachments"][0]["url"] if msg.get("attachments") else None,
"time_sent": msg["timestamp"]
})
with open(filename, "w", encoding="utf-8") as f:
json.dump(data, f, indent=4)
except Exception as e:
print(f"[-] Error saving message: {e}")
params = {"after": after_snowflake, "limit": 100} # --- Server & DM Listing ---
def list_dms():
    """Print the current user's DM channels and return them.

    Each channel is printed as ``index: label``. Channels whose ``type`` is 1
    are labelled with the username of their first recipient; every other
    channel is labelled "Group DM".

    Returns:
        The decoded JSON list of channel objects, in the order printed, so
        the caller can index it with the number the user picks.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status
            (for example an invalid token).
    """
    r = requests.get(
        "https://discord.com/api/v10/users/@me/channels",
        headers=HEADERS,
        timeout=30,  # do not hang forever on a stalled connection
    )
    # An error response is a JSON object, not a list; iterating it would fail
    # later with a misleading TypeError. Surface the HTTP error instead.
    r.raise_for_status()
    dms = r.json()
    for i, dm in enumerate(dms):
        recipient = dm['recipients'][0]['username'] if dm['type'] == 1 else "Group DM"
        print(f"{i}: {recipient}")
    return dms
def list_servers():
    """Print the guilds (servers) the current user belongs to and return them.

    Each guild is printed as ``index: name``.

    Returns:
        The decoded JSON list of guild objects, in the order printed, so the
        caller can index it with the number the user picks.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status
            (for example an invalid token).
    """
    r = requests.get(
        "https://discord.com/api/v10/users/@me/guilds",
        headers=HEADERS,
        timeout=30,  # do not hang forever on a stalled connection
    )
    # An error response is a JSON object, not a list; iterating it would fail
    # later with a misleading TypeError. Surface the HTTP error instead.
    r.raise_for_status()
    guilds = r.json()
    for i, g in enumerate(guilds):
        print(f"{i}: {g['name']}")
    return guilds
def list_channels(guild_id):
    """Print the channels of one guild and return them.

    Each channel is printed as ``index: name (id)``.

    Args:
        guild_id: ID of the guild whose channels are listed.

    Returns:
        The decoded JSON list of channel objects, in the order printed, so
        the caller can index it with the number the user picks.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status
            (for example missing access to the guild).
    """
    r = requests.get(
        f"https://discord.com/api/v10/guilds/{guild_id}/channels",
        headers=HEADERS,
        timeout=30,  # do not hang forever on a stalled connection
    )
    # An error response is a JSON object, not a list; iterating it would fail
    # later with a misleading TypeError. Surface the HTTP error instead.
    r.raise_for_status()
    chans = r.json()
    for i, c in enumerate(chans):
        print(f"{i}: {c['name']} ({c['id']})")
    return chans
# --- Historical Scrape ---
def fetch_messages(channel_id, direction, start_id, out_queue):
url = f"https://discord.com/api/v10/channels/{channel_id}/messages"
params = {"limit": 100}
params[direction] = start_id
while True: while True:
response = requests.get(url, headers=HEADERS, params=params) response = requests.get(url, headers=HEADERS, params=params)
if response.status_code == 429: if response.status_code == 429:
retry_after = response.json().get("retry_after", 1) retry_after = response.json().get("retry_after", 1)
print(f"Rate limited. Sleeping for {retry_after} seconds...") print(f"Rate limited ({direction}). Sleeping {retry_after}s...")
time.sleep(retry_after) time.sleep(retry_after)
continue continue
if response.status_code != 200: if response.status_code != 200:
print(f"Error: {response.status_code} - {response.text}") print(f"Error {direction}: {response.status_code} - {response.text}")
break break
messages = response.json() messages = response.json()
if not messages: if not messages:
break break
# always ordered newest -> oldest, so adjust pagination
if direction == "after":
params["after"] = messages[-1]["id"]
else:
params["before"] = messages[-1]["id"]
for msg in messages: for msg in messages:
if msg['id'] in seen_ids: # Skip duplicates out_queue.put({
continue "id": msg["id"],
seen_ids.add(msg['id']) "username": msg["author"]["username"],
entry = { "message": msg.get("content", ""),
"username": msg['author']['username'], "img": msg["attachments"][0]["url"] if msg.get("attachments") else None,
"message": msg.get('content', ''), "time_sent": msg["timestamp"]
"img": msg['attachments'][0]['url'] if msg.get('attachments') else None, })
"time_sent": msg['timestamp']
}
messages_data.append(entry)
params['after'] = messages[-1]['id'] def scrape_messages(channel_id, time_frame, custom_name):
time.sleep(0.1) after_datetime = parse_time_frame(time_frame)
after_snowflake = datetime_to_snowflake(after_datetime)
# Sort messages chronologically out_queue = Queue()
messages_data.sort(key=lambda x: x['time_sent']) threads = []
# thread 1: forward from cutoff
t1 = threading.Thread(target=fetch_messages, args=(channel_id, "after", after_snowflake, out_queue))
threads.append(t1)
# thread 2: backward from now
t2 = threading.Thread(target=fetch_messages, args=(channel_id, "before", "999999999999999999", out_queue))
threads.append(t2)
for t in threads:
t.start()
messages_data = {}
saved_count = 0
while any(t.is_alive() for t in threads) or not out_queue.empty():
while not out_queue.empty():
msg = out_queue.get()
if msg["id"] not in messages_data: # dedup by ID
messages_data[msg["id"]] = msg
if len(messages_data) - saved_count >= SAVE_INTERVAL:
filename = sanitize_filename(custom_name) + ".json" filename = sanitize_filename(custom_name) + ".json"
with open(filename, 'w', encoding='utf-8') as f: with open(filename, "w", encoding="utf-8") as f:
json.dump(messages_data, f, indent=4) json.dump(sorted(messages_data.values(), key=lambda x: x["time_sent"]), f, indent=4)
saved_count = len(messages_data)
print(f"Progress: {saved_count} messages saved...")
print(f"Scraped {len(messages_data)} messages. Saved to {filename}") time.sleep(0.2)
# --- Main program --- for t in threads:
t.join()
# final save
filename = sanitize_filename(custom_name) + ".json"
with open(filename, "w", encoding="utf-8") as f:
json.dump(sorted(messages_data.values(), key=lambda x: x["time_sent"]), f, indent=4)
print(f"Finished: {len(messages_data)} messages scraped. Saved to {filename}")
# --- Live Monitor ---
def live_monitor(channel_id):
    """Poll a channel for new messages until interrupted with CTRL+C.

    Every poll requests up to 50 messages; after the first poll only messages
    newer than the last one seen are requested. Each message is printed as
    ``author: content`` followed by one ``[Attachment] url`` line per
    attachment, then handed to ``save_message_to_json`` with the file name
    ``channel_<channel_id>_live.json``.

    Args:
        channel_id: ID of the channel to watch.
    """
    filename = f"channel_{channel_id}_live.json"
    url = f"https://discord.com/api/v10/channels/{channel_id}/messages"
    last_message_id = None

    print("\n[+] Live monitoring started. Press CTRL+C to stop.\n")

    try:
        while True:
            # Let requests build the query string instead of concatenating it.
            params = {"limit": 50}
            if last_message_id:
                params["after"] = last_message_id

            try:
                # The timeout keeps a stalled connection from freezing the loop.
                r = requests.get(url, headers=HEADERS, params=params, timeout=30)
                if r.status_code == 429:
                    # Wait as long as the API asks, mirroring the 429 handling
                    # of the historical scrape loop in this file.
                    retry_after = r.json().get("retry_after", 2)
                    print(f"[-] Rate limited. Sleeping {retry_after}s...")
                    time.sleep(retry_after)
                    continue
                if r.status_code != 200:
                    print(f"[-] Error fetching messages: {r.status_code} {r.text}")
                    time.sleep(2)
                    continue
                messages = r.json()
            except Exception as e:
                # Deliberately broad: the monitor must survive transient
                # network or decoding failures and simply retry.
                print(f"[-] Exception: {e}")
                time.sleep(2)
                continue

            # Process oldest first so last_message_id ends on the newest ID.
            messages = sorted(messages, key=lambda x: int(x["id"]))

            for msg in messages:
                author = msg["author"]["username"]
                content = msg.get("content", "")
                print(f"{author}: {content}")
                for attachment in msg.get("attachments", []):
                    print(f"[Attachment] {attachment['url']}")

                save_message_to_json(filename, msg)
                last_message_id = msg["id"]  # resume after this message next poll

            time.sleep(2)
    except KeyboardInterrupt:
        print("\n[!] Live monitoring stopped. Data saved.")
# --- Main Program ---
print("Choose an option:") print("Choose an option:")
print("1: DMs") print("1: DMs")
print("2: Servers") print("2: Servers")
@ -119,19 +216,33 @@ choice = input("> ")
if choice == "1": if choice == "1":
dms = list_dms() dms = list_dms()
dm_index = int(input("Choose a DM to scrape: ")) dm_index = int(input("Choose a DM: "))
channel_id = dms[dm_index]['id'] channel_id = dms[dm_index]['id']
custom_name = input("Enter custom filename (without extension): ") custom_name = input("Custom filename (no extension): ")
time_frame = input("Enter time frame (e.g., 1-99h, 1-99d, 1-99m): ")
scrape_messages(channel_id, time_frame, custom_name)
elif choice == "2": elif choice == "2":
guilds = list_servers() guilds = list_servers()
guild_index = int(input("Choose a server: ")) guild_index = int(input("Choose a server: "))
channels = list_channels(guilds[guild_index]['id']) channels = list_channels(guilds[guild_index]['id'])
channel_index = int(input("Choose a channel to scrape: ")) channel_index = int(input("Choose a channel: "))
channel_id = channels[channel_index]['id'] channel_id = channels[channel_index]['id']
print("Choose mode:")
print("1: Live-Monitor")
print("2: ScrapeBack")
mode = input("> ")
if mode == "1":
live_monitor(channel_id)
elif mode == "2":
custom_name = f"{guilds[guild_index]['name']}_{channels[channel_index]['name']}" custom_name = f"{guilds[guild_index]['name']}_{channels[channel_index]['name']}"
time_frame = input("Enter time frame (e.g., 1-99h, 1-99d, 1-99m): ")
scrape_messages(channel_id, time_frame, custom_name)
else:
print("Invalid choice")
exit()
else: else:
print("Invalid choice") print("Invalid choice")
exit() exit()
time_frame = input("Enter time frame (e.g., 1-99h, 1-99d, 1-99m): ")
scrape_messages(channel_id, time_frame, custom_name)