Update scrape.py

Added 2 threads for faster scanning 
live monitoring option for chat
Data should save every 100 messages in scrapeback mode, in case you need to keyboard-interrupt with Ctrl+C
This commit is contained in:
koshiro 2025-09-20 17:03:57 -05:00 committed by GitHub
parent ac9e02010b
commit 3e48a9955a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

249
scrape.py
View file

@ -1,8 +1,10 @@
import requests import requests
import json import json
import time import time
import threading
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
import re import re
from queue import Queue
# --- User input --- # --- User input ---
TOKEN = input("Enter your Discord user token: ") TOKEN = input("Enter your Discord user token: ")
@ -11,107 +13,202 @@ HEADERS = {
"Content-Type": "application/json" "Content-Type": "application/json"
} }
DISCORD_EPOCH = 1420070400000 # Discord epoch in milliseconds DISCORD_EPOCH = 1420070400000 # Discord epoch in ms
SAVE_INTERVAL = 200 # save every N messages
# --- Utility Functions ---
def datetime_to_snowflake(dt):
    """Convert a datetime into a Discord snowflake ID usable as a paging bound.

    The millisecond offset from DISCORD_EPOCH is shifted left by 22 bits, which
    is where a snowflake stores its timestamp.

    Args:
        dt: The instant to convert. ``dt.timestamp()`` is used, so a naive
            datetime is interpreted in the local timezone.

    Returns:
        The snowflake as a decimal string. Instants earlier than
        DISCORD_EPOCH clamp to "0" instead of producing a negative ID.
    """
    timestamp_ms = int(dt.timestamp() * 1000)
    # A negative offset would yield a negative (invalid) snowflake; 0 already
    # sorts before every real ID, so it is the correct lower bound.
    offset_ms = max(timestamp_ms - DISCORD_EPOCH, 0)
    return str(offset_ms << 22)
def parse_time_frame(time_frame):
    """Translate a look-back window such as "12h" or "3d" into a UTC cutoff.

    Args:
        time_frame: A non-negative integer followed by a unit letter
            (case-insensitive): ``h`` for hours, ``d`` for days, ``m`` for
            months (counted as 30 days each). Surrounding whitespace is
            ignored.

    Returns:
        A timezone-aware UTC datetime lying the requested span before now.
        Input that cannot be parsed (empty string, non-numeric or negative
        count, unknown unit) falls back to one day and prints a warning.
    """
    now = datetime.now(timezone.utc)
    unit_spans = {
        'h': timedelta(hours=1),
        'd': timedelta(days=1),
        'm': timedelta(days=30),
    }
    text = time_frame.strip()
    try:
        # int("") raises ValueError, so an empty string never reaches text[-1].
        count = int(text[:-1])
        if count < 0:
            raise ValueError("negative look-back")
        return now - count * unit_spans[text[-1].lower()]
    except (ValueError, KeyError, OverflowError):
        print("Invalid time format. Using 1 day as default.")
        return now - timedelta(days=1)
def sanitize_filename(name):
    """Return *name* with every character that is not an ASCII letter, digit,
    underscore or dash replaced by an underscore."""
    unsafe_char = re.compile(r'[^A-Za-z0-9_\-]')
    cleaned = unsafe_char.sub('_', name)
    return cleaned
def save_message_to_json(filename, msg):
    """Append one Discord message to a JSON log file, skipping duplicates.

    The file holds a JSON array of flattened records with the keys ``id``,
    ``username``, ``message``, ``img`` (URL of the first attachment or None)
    and ``time_sent``.

    Args:
        filename: Path of the JSON log; created when it does not exist. A
            zero-length file is treated as an empty log.
        msg: Message object as returned by the API. ``id``,
            ``author.username`` and ``timestamp`` are required; ``content``
            and ``attachments`` are optional.

    Failures are printed rather than raised so that one bad message does not
    stop the caller's loop. A log that exists but is not a valid JSON array is
    left untouched.
    """
    import os  # local import: only needed for the atomic replace below

    try:
        try:
            with open(filename, "r", encoding="utf-8") as f:
                raw = f.read()
        except FileNotFoundError:
            raw = ""
        data = json.loads(raw) if raw.strip() else []
        if not isinstance(data, list):
            raise ValueError(f"{filename} does not contain a JSON array")

        # Prevent duplicates (stops at the first match instead of building
        # a full ID list on every call).
        if any(entry.get("id") == msg["id"] for entry in data):
            return

        attachments = msg.get("attachments")
        data.append({
            "id": msg["id"],
            "username": msg["author"]["username"],
            "message": msg.get("content", ""),
            "img": attachments[0]["url"] if attachments else None,
            "time_sent": msg["timestamp"],
        })

        # Write to a sibling temp file and swap it in, so an interrupt
        # (e.g. Ctrl+C) mid-write cannot leave a truncated log behind.
        tmp_path = f"{filename}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4)
        os.replace(tmp_path, filename)
    except (OSError, ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
        print(f"[-] Error saving message: {e}")
# --- Server & DM Listing ---
def list_dms():
    """Fetch the user's DM channels, print a numbered menu, and return them.

    A channel of type 1 is labelled with its first recipient's username;
    every other channel type is labelled "Group DM".

    Returns:
        The decoded JSON body of the ``/users/@me/channels`` response.
    """
    endpoint = "https://discord.com/api/v10/users/@me/channels"
    dms = requests.get(endpoint, headers=HEADERS).json()
    for index, channel in enumerate(dms):
        if channel['type'] == 1:
            label = channel['recipients'][0]['username']
        else:
            label = "Group DM"
        print(f"{index}: {label}")
    return dms
def list_servers():
    """Fetch the user's guilds, print a numbered menu of names, and return them.

    Returns:
        The decoded JSON body of the ``/users/@me/guilds`` response.
    """
    endpoint = "https://discord.com/api/v10/users/@me/guilds"
    guilds = requests.get(endpoint, headers=HEADERS).json()
    index = 0
    for guild in guilds:
        print(f"{index}: {guild['name']}")
        index += 1
    return guilds
def list_channels(guild_id):
    """Fetch a guild's channels, print a numbered menu, and return them.

    Args:
        guild_id: ID of the guild whose channels are listed.

    Returns:
        The decoded JSON body of the ``/guilds/{guild_id}/channels`` response.
    """
    endpoint = f"https://discord.com/api/v10/guilds/{guild_id}/channels"
    chans = requests.get(endpoint, headers=HEADERS).json()
    for index, channel in enumerate(chans):
        name, channel_id = channel['name'], channel['id']
        print(f"{index}: {name} ({channel_id})")
    return chans
# --- Historical Scrape ---
def fetch_messages(channel_id, direction, start_id, out_queue, stop_id=None):
    """Page through a channel's history and push flattened messages onto a queue.

    Args:
        channel_id: ID of the channel to read.
        direction: "after" to walk forward in time from ``start_id``; any
            other value is sent as-is for the first request and then paged
            as "before" (walking backward).
        start_id: Snowflake ID (string) at which paging starts.
        out_queue: Queue that receives one dict per message with the keys
            ``id``, ``username``, ``message``, ``img`` and ``time_sent``.
        stop_id: Optional snowflake bound. Messages at or beyond it (newer
            when walking forward, older when walking backward) are dropped
            and paging ends as soon as one is seen. ``None`` means unbounded.
    """
    url = f"https://discord.com/api/v10/channels/{channel_id}/messages"
    params = {"limit": 100}
    params[direction] = start_id
    forward = direction == "after"
    bound = int(stop_id) if stop_id is not None else None

    while True:
        response = requests.get(url, headers=HEADERS, params=params)

        if response.status_code == 429:
            retry_after = response.json().get("retry_after", 1)
            print(f"Rate limited ({direction}). Sleeping {retry_after}s...")
            time.sleep(retry_after)
            continue

        if response.status_code != 200:
            print(f"Error {direction}: {response.status_code} - {response.text}")
            break

        messages = response.json()
        if not messages:
            break

        # Advance the cursor past the WHOLE page: the newest ID when walking
        # forward, the oldest when walking backward. Taking min/max of the IDs
        # keeps this correct regardless of the order the page arrives in
        # (using the last element for "after" would re-request nearly the
        # same page every time).
        page_ids = [int(m["id"]) for m in messages]
        if forward:
            params["after"] = str(max(page_ids))
        else:
            params["before"] = str(min(page_ids))

        crossed_bound = False
        for msg in messages:
            if bound is not None:
                msg_id = int(msg["id"])
                if (msg_id >= bound) if forward else (msg_id <= bound):
                    crossed_bound = True
                    continue
            out_queue.put({
                "id": msg["id"],
                "username": msg["author"]["username"],
                "message": msg.get("content", ""),
                "img": msg["attachments"][0]["url"] if msg.get("attachments") else None,
                "time_sent": msg["timestamp"],
            })
        if crossed_bound:
            break


def _write_snapshot(filename, messages_by_id):
    """Write every collected message to *filename* as a JSON array, oldest first."""
    # Snowflake IDs embed the creation time, so numeric ID order is
    # chronological and does not depend on the timestamp string format.
    ordered = sorted(messages_by_id.values(), key=lambda m: int(m["id"]))
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(ordered, f, indent=4)


def scrape_messages(channel_id, time_frame, custom_name):
    """Scrape a channel's messages from a look-back cutoff up to now.

    Two worker threads cover the window from both ends: one pages forward
    from the cutoff, the other pages backward from the current time and stops
    at the cutoff. Results are de-duplicated by message ID, written to
    ``<sanitized custom_name>.json`` every SAVE_INTERVAL new messages, and
    once more at the end. Pressing Ctrl+C stops the scrape and saves what has
    been collected so far.

    Args:
        channel_id: ID of the channel to scrape.
        time_frame: Look-back window understood by ``parse_time_frame``.
        custom_name: Base name of the output file (sanitized, no extension).
    """
    after_snowflake = datetime_to_snowflake(parse_time_frame(time_frame))
    # Start the backward walk at the real "now"; a fixed 18-digit constant is
    # already older than present-day 19-digit snowflakes.
    now_snowflake = datetime_to_snowflake(datetime.now(timezone.utc))
    filename = sanitize_filename(custom_name) + ".json"

    out_queue = Queue()
    # Daemon threads: an interrupted scrape must not keep the process alive.
    threads = [
        # thread 1: forward from cutoff
        threading.Thread(
            target=fetch_messages,
            args=(channel_id, "after", after_snowflake, out_queue),
            daemon=True,
        ),
        # thread 2: backward from now, bounded below by the cutoff
        threading.Thread(
            target=fetch_messages,
            args=(channel_id, "before", now_snowflake, out_queue, after_snowflake),
            daemon=True,
        ),
    ]
    for t in threads:
        t.start()

    messages_data = {}
    saved_count = 0

    try:
        while any(t.is_alive() for t in threads) or not out_queue.empty():
            while not out_queue.empty():
                msg = out_queue.get()
                messages_data.setdefault(msg["id"], msg)  # dedup by ID

            if len(messages_data) - saved_count >= SAVE_INTERVAL:
                _write_snapshot(filename, messages_data)
                saved_count = len(messages_data)
                print(f"Progress: {saved_count} messages saved...")

            time.sleep(0.2)

        for t in threads:
            t.join()
    except KeyboardInterrupt:
        print("\n[!] Interrupted. Saving messages collected so far...")

    # final save
    _write_snapshot(filename, messages_data)
    print(f"Finished: {len(messages_data)} messages scraped. Saved to {filename}")
# --- Main program --- # --- Live Monitor ---
def live_monitor(channel_id):
    """Poll a channel for new messages, printing and logging each one.

    Every two seconds the channel's messages endpoint is queried (50 at a
    time) for anything after the last message already seen. Each message is
    printed as ``username: content`` followed by its attachment URLs, then
    appended to ``channel_<channel_id>_live.json`` via
    ``save_message_to_json``. The loop runs until interrupted with Ctrl+C.

    Args:
        channel_id: ID of the channel to watch.
    """
    filename = f"channel_{channel_id}_live.json"
    url = f"https://discord.com/api/v10/channels/{channel_id}/messages"
    last_message_id = None

    print("\n[+] Live monitoring started. Press CTRL+C to stop.\n")

    try:
        while True:
            params = {"limit": 50}
            if last_message_id:
                params["after"] = last_message_id

            try:
                r = requests.get(url, headers=HEADERS, params=params)
                if r.status_code == 429:
                    # Honour the server-provided back-off, as the
                    # historical scraper does, instead of a fixed delay.
                    retry_after = r.json().get("retry_after", 2)
                    print(f"[-] Rate limited. Sleeping {retry_after}s...")
                    time.sleep(retry_after)
                    continue
                if r.status_code != 200:
                    print(f"[-] Error fetching messages: {r.status_code} {r.text}")
                    time.sleep(2)
                    continue
                messages = r.json()
            except Exception as e:
                # Best-effort loop: report the failure and keep polling.
                print(f"[-] Exception: {e}")
                time.sleep(2)
                continue

            # oldest first, so output reads chronologically and the cursor
            # ends on the newest message
            for msg in sorted(messages, key=lambda x: int(x["id"])):
                author = msg["author"]["username"]
                content = msg.get("content", "")
                print(f"{author}: {content}")
                for attachment in msg.get("attachments", []):
                    print(f"[Attachment] {attachment['url']}")

                save_message_to_json(filename, msg)
                last_message_id = msg["id"]  # update to latest message

            time.sleep(2)
    except KeyboardInterrupt:
        print("\n[!] Live monitoring stopped. Data saved.")
# --- Main Program ---
# Top-level menu: pick a DM or a server channel, then run the chosen mode.
for menu_line in ("Choose an option:", "1: DMs", "2: Servers"):
    print(menu_line)
choice = input("> ")

if choice not in ("1", "2"):
    print("Invalid choice")
    exit()

if choice == "1":
    # DMs are always scraped historically, under a user-supplied file name.
    dms = list_dms()
    dm_index = int(input("Choose a DM: "))
    channel_id = dms[dm_index]['id']
    custom_name = input("Custom filename (no extension): ")
    time_frame = input("Enter time frame (e.g., 1-99h, 1-99d, 1-99m): ")
    scrape_messages(channel_id, time_frame, custom_name)
else:
    # Server channels offer a choice between live monitoring and scrape-back.
    guilds = list_servers()
    guild_index = int(input("Choose a server: "))
    channels = list_channels(guilds[guild_index]['id'])
    channel_index = int(input("Choose a channel: "))
    channel_id = channels[channel_index]['id']

    for menu_line in ("Choose mode:", "1: Live-Monitor", "2: ScrapeBack"):
        print(menu_line)
    mode = input("> ")

    if mode not in ("1", "2"):
        print("Invalid choice")
        exit()

    if mode == "1":
        live_monitor(channel_id)
    else:
        # The output file is named after the server and channel.
        custom_name = f"{guilds[guild_index]['name']}_{channels[channel_index]['name']}"
        time_frame = input("Enter time frame (e.g., 1-99h, 1-99d, 1-99m): ")
        scrape_messages(channel_id, time_frame, custom_name)