Update scrape.py
Added 2 threads for faster scanning and a live monitoring option for chat. Data should save every 100 messages in scrapeback mode, in case you need to keyboard-interrupt with Ctrl+C.
This commit is contained in:
parent
ac9e02010b
commit
3e48a9955a
1 changed files with 180 additions and 69 deletions
249
scrape.py
249
scrape.py
|
|
@ -1,8 +1,10 @@
|
||||||
import requests
|
import requests
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
|
import threading
|
||||||
from datetime import datetime, timedelta, timezone
|
from datetime import datetime, timedelta, timezone
|
||||||
import re
|
import re
|
||||||
|
from queue import Queue
|
||||||
|
|
||||||
# --- User input ---
|
# --- User input ---
|
||||||
TOKEN = input("Enter your Discord user token: ")
|
TOKEN = input("Enter your Discord user token: ")
|
||||||
|
|
@ -11,107 +13,202 @@ HEADERS = {
|
||||||
"Content-Type": "application/json"
|
"Content-Type": "application/json"
|
||||||
}
|
}
|
||||||
|
|
||||||
DISCORD_EPOCH = 1420070400000 # Discord epoch in milliseconds
|
DISCORD_EPOCH = 1420070400000 # Discord epoch in ms
|
||||||
|
SAVE_INTERVAL = 200 # save every N messages
|
||||||
|
|
||||||
|
# --- Utility Functions ---
|
||||||
def datetime_to_snowflake(dt):
|
def datetime_to_snowflake(dt):
|
||||||
timestamp_ms = int(dt.timestamp() * 1000)
|
timestamp_ms = int(dt.timestamp() * 1000)
|
||||||
snowflake = (timestamp_ms - DISCORD_EPOCH) << 22
|
return str((timestamp_ms - DISCORD_EPOCH) << 22)
|
||||||
return snowflake
|
|
||||||
|
|
||||||
|
def parse_time_frame(time_frame):
    """Convert a relative time frame such as ``"12h"`` into a UTC cutoff.

    Args:
        time_frame: A non-negative integer followed by a unit letter:
            ``h`` (hours), ``d`` (days) or ``m`` (months, counted as
            30 days each). Case and surrounding whitespace are ignored.

    Returns:
        A timezone-aware ``datetime`` (UTC) equal to "now" minus the
        requested span. Malformed input (empty string, non-numeric or
        negative amount, unknown unit, span too large to represent)
        falls back to one day ago and prints a warning instead of raising.
    """
    now = datetime.now(timezone.utc)
    hours_per_unit = {"h": 1, "d": 24, "m": 24 * 30}
    # The value normally comes straight from input(), so tolerate stray
    # whitespace and mixed case.
    text = (time_frame or "").strip().lower()

    try:
        amount = int(text[:-1])
        if amount < 0:
            # A negative span would place the cutoff in the future.
            raise ValueError("time frame must not be negative")
        return now - timedelta(hours=amount * hours_per_unit[text[-1]])
    except (ValueError, KeyError, IndexError, OverflowError):
        print("Invalid time format. Using 1 day as default.")
        return now - timedelta(days=1)
|
||||||
|
|
||||||
|
def sanitize_filename(name):
    """Return *name* reduced to characters that are safe in a file name.

    Every character other than ASCII letters, digits, underscore and dash
    (emoji, spaces, punctuation, ...) is replaced by an underscore.
    An empty *name* yields ``"untitled"`` so that appending an extension
    never produces a bare, hidden ``.json`` file.
    """
    cleaned = re.sub(r'[^A-Za-z0-9_\-]', '_', name)
    return cleaned or "untitled"
|
||||||
|
|
||||||
|
def save_message_to_json(filename, msg):
    """Append one message to the JSON array stored in *filename*.

    The file is created if it does not exist. A message whose ``id`` is
    already present in the file is skipped, and in that case the file is
    not rewritten.

    Args:
        filename: Path of the JSON file holding a list of saved entries.
        msg: Message dict; reads ``id``, ``author.username``,
            ``timestamp`` and the optional ``content`` / ``attachments``.

    Returns:
        None. Any failure (unreadable or corrupt file, missing key, write
        error) is reported on stdout and swallowed so that a polling
        caller keeps running.
    """
    import os  # local import: only needed for the atomic replace below

    try:
        try:
            with open(filename, "r", encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            data = []

        # Prevent duplicates
        if any(saved["id"] == msg["id"] for saved in data):
            return

        attachments = msg.get("attachments")
        data.append({
            "id": msg["id"],
            "username": msg["author"]["username"],
            "message": msg.get("content", ""),
            # Only the first attachment URL is recorded.
            "img": attachments[0]["url"] if attachments else None,
            "time_sent": msg["timestamp"],
        })

        # Write to a sibling temp file and swap it in, so an interrupt
        # (e.g. CTRL+C) in the middle of the dump cannot leave a truncated
        # archive behind.
        tmp_name = f"{filename}.tmp"
        with open(tmp_name, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4)
        os.replace(tmp_name, filename)
    except Exception as e:
        print(f"[-] Error saving message: {e}")
|
||||||
|
|
||||||
|
# --- Server & DM Listing ---
|
||||||
def list_dms():
|
def list_dms():
|
||||||
response = requests.get("https://discord.com/api/v10/users/@me/channels", headers=HEADERS)
|
r = requests.get("https://discord.com/api/v10/users/@me/channels", headers=HEADERS)
|
||||||
dms = response.json()
|
dms = r.json()
|
||||||
for i, dm in enumerate(dms):
|
for i, dm in enumerate(dms):
|
||||||
recipient = dm['recipients'][0]['username'] if dm['type'] == 1 else "Group DM"
|
recipient = dm['recipients'][0]['username'] if dm['type'] == 1 else "Group DM"
|
||||||
print(f"{i}: {recipient}")
|
print(f"{i}: {recipient}")
|
||||||
return dms
|
return dms
|
||||||
|
|
||||||
def list_servers():
|
def list_servers():
|
||||||
response = requests.get("https://discord.com/api/v10/users/@me/guilds", headers=HEADERS)
|
r = requests.get("https://discord.com/api/v10/users/@me/guilds", headers=HEADERS)
|
||||||
guilds = response.json()
|
guilds = r.json()
|
||||||
for i, guild in enumerate(guilds):
|
for i, g in enumerate(guilds):
|
||||||
print(f"{i}: {guild['name']}")
|
print(f"{i}: {g['name']}")
|
||||||
return guilds
|
return guilds
|
||||||
|
|
||||||
def list_channels(guild_id):
|
def list_channels(guild_id):
|
||||||
response = requests.get(f"https://discord.com/api/v10/guilds/{guild_id}/channels", headers=HEADERS)
|
r = requests.get(f"https://discord.com/api/v10/guilds/{guild_id}/channels", headers=HEADERS)
|
||||||
channels = response.json()
|
chans = r.json()
|
||||||
for i, channel in enumerate(channels):
|
for i, c in enumerate(chans):
|
||||||
print(f"{i}: {channel['name']} ({channel['id']})")
|
print(f"{i}: {c['name']} ({c['id']})")
|
||||||
return channels
|
return chans
|
||||||
|
|
||||||
def parse_time_frame(time_frame):
|
# --- Historical Scrape ---
|
||||||
now = datetime.now(timezone.utc)
|
def fetch_messages(channel_id, direction, start_id, out_queue):
|
||||||
num = int(time_frame[:-1])
|
|
||||||
unit = time_frame[-1].lower()
|
|
||||||
|
|
||||||
if unit == 'h':
|
|
||||||
delta = timedelta(hours=num)
|
|
||||||
elif unit == 'd':
|
|
||||||
delta = timedelta(days=num)
|
|
||||||
elif unit == 'm':
|
|
||||||
delta = timedelta(days=num*30)
|
|
||||||
else:
|
|
||||||
print("Invalid time format. Using 1 day as default.")
|
|
||||||
delta = timedelta(days=1)
|
|
||||||
|
|
||||||
return now - delta
|
|
||||||
|
|
||||||
def sanitize_filename(name):
|
|
||||||
"""Remove emojis and keep only letters, numbers, underscores, and dashes."""
|
|
||||||
return re.sub(r'[^A-Za-z0-9_\-]', '_', name)
|
|
||||||
|
|
||||||
def scrape_messages(channel_id, time_frame, custom_name):
|
|
||||||
url = f"https://discord.com/api/v10/channels/{channel_id}/messages"
|
url = f"https://discord.com/api/v10/channels/{channel_id}/messages"
|
||||||
after_datetime = parse_time_frame(time_frame)
|
params = {"limit": 100}
|
||||||
after_snowflake = str(datetime_to_snowflake(after_datetime))
|
params[direction] = start_id
|
||||||
messages_data = []
|
|
||||||
seen_ids = set() # Track message IDs to avoid duplicates
|
|
||||||
|
|
||||||
params = {"after": after_snowflake, "limit": 100}
|
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
response = requests.get(url, headers=HEADERS, params=params)
|
response = requests.get(url, headers=HEADERS, params=params)
|
||||||
|
|
||||||
if response.status_code == 429:
|
if response.status_code == 429:
|
||||||
retry_after = response.json().get("retry_after", 1)
|
retry_after = response.json().get("retry_after", 1)
|
||||||
print(f"Rate limited. Sleeping for {retry_after} seconds...")
|
print(f"Rate limited ({direction}). Sleeping {retry_after}s...")
|
||||||
time.sleep(retry_after)
|
time.sleep(retry_after)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
print(f"Error: {response.status_code} - {response.text}")
|
print(f"Error {direction}: {response.status_code} - {response.text}")
|
||||||
break
|
break
|
||||||
|
|
||||||
messages = response.json()
|
messages = response.json()
|
||||||
if not messages:
|
if not messages:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# always ordered newest -> oldest, so adjust pagination
|
||||||
|
if direction == "after":
|
||||||
|
params["after"] = messages[-1]["id"]
|
||||||
|
else:
|
||||||
|
params["before"] = messages[-1]["id"]
|
||||||
|
|
||||||
for msg in messages:
|
for msg in messages:
|
||||||
if msg['id'] in seen_ids: # Skip duplicates
|
out_queue.put({
|
||||||
continue
|
"id": msg["id"],
|
||||||
seen_ids.add(msg['id'])
|
"username": msg["author"]["username"],
|
||||||
entry = {
|
"message": msg.get("content", ""),
|
||||||
"username": msg['author']['username'],
|
"img": msg["attachments"][0]["url"] if msg.get("attachments") else None,
|
||||||
"message": msg.get('content', ''),
|
"time_sent": msg["timestamp"]
|
||||||
"img": msg['attachments'][0]['url'] if msg.get('attachments') else None,
|
})
|
||||||
"time_sent": msg['timestamp']
|
|
||||||
}
|
|
||||||
messages_data.append(entry)
|
|
||||||
|
|
||||||
params['after'] = messages[-1]['id']
|
def scrape_messages(channel_id, time_frame, custom_name):
|
||||||
time.sleep(0.1)
|
after_datetime = parse_time_frame(time_frame)
|
||||||
|
after_snowflake = datetime_to_snowflake(after_datetime)
|
||||||
|
|
||||||
# Sort messages chronologically
|
out_queue = Queue()
|
||||||
messages_data.sort(key=lambda x: x['time_sent'])
|
threads = []
|
||||||
|
|
||||||
|
# thread 1: forward from cutoff
|
||||||
|
t1 = threading.Thread(target=fetch_messages, args=(channel_id, "after", after_snowflake, out_queue))
|
||||||
|
threads.append(t1)
|
||||||
|
|
||||||
|
# thread 2: backward from now
|
||||||
|
t2 = threading.Thread(target=fetch_messages, args=(channel_id, "before", "999999999999999999", out_queue))
|
||||||
|
threads.append(t2)
|
||||||
|
|
||||||
|
for t in threads:
|
||||||
|
t.start()
|
||||||
|
|
||||||
|
messages_data = {}
|
||||||
|
saved_count = 0
|
||||||
|
|
||||||
|
while any(t.is_alive() for t in threads) or not out_queue.empty():
|
||||||
|
while not out_queue.empty():
|
||||||
|
msg = out_queue.get()
|
||||||
|
if msg["id"] not in messages_data: # dedup by ID
|
||||||
|
messages_data[msg["id"]] = msg
|
||||||
|
|
||||||
|
if len(messages_data) - saved_count >= SAVE_INTERVAL:
|
||||||
|
filename = sanitize_filename(custom_name) + ".json"
|
||||||
|
with open(filename, "w", encoding="utf-8") as f:
|
||||||
|
json.dump(sorted(messages_data.values(), key=lambda x: x["time_sent"]), f, indent=4)
|
||||||
|
saved_count = len(messages_data)
|
||||||
|
print(f"Progress: {saved_count} messages saved...")
|
||||||
|
|
||||||
|
time.sleep(0.2)
|
||||||
|
|
||||||
|
for t in threads:
|
||||||
|
t.join()
|
||||||
|
|
||||||
|
# final save
|
||||||
filename = sanitize_filename(custom_name) + ".json"
|
filename = sanitize_filename(custom_name) + ".json"
|
||||||
with open(filename, 'w', encoding='utf-8') as f:
|
with open(filename, "w", encoding="utf-8") as f:
|
||||||
json.dump(messages_data, f, indent=4)
|
json.dump(sorted(messages_data.values(), key=lambda x: x["time_sent"]), f, indent=4)
|
||||||
|
|
||||||
print(f"Scraped {len(messages_data)} messages. Saved to {filename}")
|
print(f"Finished: {len(messages_data)} messages scraped. Saved to {filename}")
|
||||||
|
|
||||||
# --- Main program ---
|
# --- Live Monitor ---
|
||||||
|
def live_monitor(channel_id):
    """Poll a channel for new messages until interrupted with CTRL+C.

    Roughly every two seconds the newest messages are requested (only
    those after the last one seen, once one has been seen), printed to
    stdout together with their attachment URLs, and appended to
    ``channel_<channel_id>_live.json`` via ``save_message_to_json``.

    Args:
        channel_id: ID of the channel (or DM channel) to watch.

    Returns:
        None. The loop ends only on ``KeyboardInterrupt``.
    """
    filename = f"channel_{channel_id}_live.json"
    url = f"https://discord.com/api/v10/channels/{channel_id}/messages"
    poll_seconds = 2
    last_message_id = None
    print("\n[+] Live monitoring started. Press CTRL+C to stop.\n")
    try:
        while True:
            params = {"limit": 50}
            if last_message_id:
                params["after"] = last_message_id

            try:
                # requests never times out by default; without a timeout a
                # stalled connection would freeze the monitor indefinitely.
                r = requests.get(url, headers=HEADERS, params=params, timeout=30)
                if r.status_code == 429:
                    # Wait for as long as the API asks, matching the
                    # historical scraper's rate-limit handling.
                    retry_after = r.json().get("retry_after", 1)
                    print(f"Rate limited. Sleeping {retry_after}s...")
                    time.sleep(retry_after)
                    continue
                if r.status_code != 200:
                    print(f"[-] Error fetching messages: {r.status_code} {r.text}")
                    time.sleep(poll_seconds)
                    continue
                messages = r.json()
            except Exception as e:
                # Best-effort polling loop: report and retry rather than
                # letting one network/parse failure end the session.
                print(f"[-] Exception: {e}")
                time.sleep(poll_seconds)
                continue

            # Sort by numeric ID so messages are handled oldest first.
            for msg in sorted(messages, key=lambda m: int(m["id"])):
                author = msg["author"]["username"]
                content = msg.get("content", "")
                print(f"{author}: {content}")
                for attachment in msg.get("attachments", []):
                    print(f"[Attachment] {attachment['url']}")
                save_message_to_json(filename, msg)
                last_message_id = msg["id"]  # update to latest message

            time.sleep(poll_seconds)

    except KeyboardInterrupt:
        print("\n[!] Live monitoring stopped. Data saved.")
|
||||||
|
|
||||||
|
# --- Main Program ---
|
||||||
print("Choose an option:")
|
print("Choose an option:")
|
||||||
print("1: DMs")
|
print("1: DMs")
|
||||||
print("2: Servers")
|
print("2: Servers")
|
||||||
|
|
@ -119,19 +216,33 @@ choice = input("> ")
|
||||||
|
|
||||||
if choice == "1":
|
if choice == "1":
|
||||||
dms = list_dms()
|
dms = list_dms()
|
||||||
dm_index = int(input("Choose a DM to scrape: "))
|
dm_index = int(input("Choose a DM: "))
|
||||||
channel_id = dms[dm_index]['id']
|
channel_id = dms[dm_index]['id']
|
||||||
custom_name = input("Enter custom filename (without extension): ")
|
custom_name = input("Custom filename (no extension): ")
|
||||||
|
time_frame = input("Enter time frame (e.g., 1-99h, 1-99d, 1-99m): ")
|
||||||
|
scrape_messages(channel_id, time_frame, custom_name)
|
||||||
|
|
||||||
elif choice == "2":
|
elif choice == "2":
|
||||||
guilds = list_servers()
|
guilds = list_servers()
|
||||||
guild_index = int(input("Choose a server: "))
|
guild_index = int(input("Choose a server: "))
|
||||||
channels = list_channels(guilds[guild_index]['id'])
|
channels = list_channels(guilds[guild_index]['id'])
|
||||||
channel_index = int(input("Choose a channel to scrape: "))
|
channel_index = int(input("Choose a channel: "))
|
||||||
channel_id = channels[channel_index]['id']
|
channel_id = channels[channel_index]['id']
|
||||||
custom_name = f"{guilds[guild_index]['name']}_{channels[channel_index]['name']}"
|
|
||||||
|
print("Choose mode:")
|
||||||
|
print("1: Live-Monitor")
|
||||||
|
print("2: ScrapeBack")
|
||||||
|
mode = input("> ")
|
||||||
|
|
||||||
|
if mode == "1":
|
||||||
|
live_monitor(channel_id)
|
||||||
|
elif mode == "2":
|
||||||
|
custom_name = f"{guilds[guild_index]['name']}_{channels[channel_index]['name']}"
|
||||||
|
time_frame = input("Enter time frame (e.g., 1-99h, 1-99d, 1-99m): ")
|
||||||
|
scrape_messages(channel_id, time_frame, custom_name)
|
||||||
|
else:
|
||||||
|
print("Invalid choice")
|
||||||
|
exit()
|
||||||
else:
|
else:
|
||||||
print("Invalid choice")
|
print("Invalid choice")
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
time_frame = input("Enter time frame (e.g., 1-99h, 1-99d, 1-99m): ")
|
|
||||||
scrape_messages(channel_id, time_frame, custom_name)
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue