diff --git a/scrape.py b/scrape.py new file mode 100644 index 0000000..638a835 --- /dev/null +++ b/scrape.py @@ -0,0 +1,137 @@ +import requests +import json +import time +from datetime import datetime, timedelta, timezone +import re + +# --- User input --- +TOKEN = input("Enter your Discord user token: ") +HEADERS = { + "Authorization": TOKEN, + "Content-Type": "application/json" +} + +DISCORD_EPOCH = 1420070400000 # Discord epoch in milliseconds + +def datetime_to_snowflake(dt): + timestamp_ms = int(dt.timestamp() * 1000) + snowflake = (timestamp_ms - DISCORD_EPOCH) << 22 + return snowflake + +def list_dms(): + response = requests.get("https://discord.com/api/v10/users/@me/channels", headers=HEADERS) + dms = response.json() + for i, dm in enumerate(dms): + recipient = dm['recipients'][0]['username'] if dm['type'] == 1 else "Group DM" + print(f"{i}: {recipient}") + return dms + +def list_servers(): + response = requests.get("https://discord.com/api/v10/users/@me/guilds", headers=HEADERS) + guilds = response.json() + for i, guild in enumerate(guilds): + print(f"{i}: {guild['name']}") + return guilds + +def list_channels(guild_id): + response = requests.get(f"https://discord.com/api/v10/guilds/{guild_id}/channels", headers=HEADERS) + channels = response.json() + for i, channel in enumerate(channels): + print(f"{i}: {channel['name']} ({channel['id']})") + return channels + +def parse_time_frame(time_frame): + now = datetime.now(timezone.utc) + num = int(time_frame[:-1]) + unit = time_frame[-1].lower() + + if unit == 'h': + delta = timedelta(hours=num) + elif unit == 'd': + delta = timedelta(days=num) + elif unit == 'm': + delta = timedelta(days=num*30) + else: + print("Invalid time format. Using 1 day as default.") + delta = timedelta(days=1) + + return now - delta + +def sanitize_filename(name): + """Remove emojis and keep only letters, numbers, underscores, and dashes.""" + return re.sub(r'[^A-Za-z0-9_\-]', '_', name) + +def scrape_messages(channel_id, time_frame, custom_name): + url = f"https://discord.com/api/v10/channels/{channel_id}/messages" + after_datetime = parse_time_frame(time_frame) + after_snowflake = str(datetime_to_snowflake(after_datetime)) + messages_data = [] + seen_ids = set() # Track message IDs to avoid duplicates + + params = {"after": after_snowflake, "limit": 100} + + while True: + response = requests.get(url, headers=HEADERS, params=params) + + if response.status_code == 429: + retry_after = response.json().get("retry_after", 1) + print(f"Rate limited. Sleeping for {retry_after} seconds...") + time.sleep(retry_after) + continue + + if response.status_code != 200: + print(f"Error: {response.status_code} - {response.text}") + break + + messages = response.json() + if not messages: + break + + for msg in messages: + if msg['id'] in seen_ids: # Skip duplicates + continue + seen_ids.add(msg['id']) + entry = { + "username": msg['author']['username'], + "message": msg.get('content', ''), + "img": msg['attachments'][0]['url'] if msg.get('attachments') else None, + "time_sent": msg['timestamp'] + } + messages_data.append(entry) + + params['after'] = messages[-1]['id'] + time.sleep(0.1) + + # Sort messages chronologically + messages_data.sort(key=lambda x: x['time_sent']) + + filename = sanitize_filename(custom_name) + ".json" + with open(filename, 'w', encoding='utf-8') as f: + json.dump(messages_data, f, indent=4) + + print(f"Scraped {len(messages_data)} messages. Saved to {filename}") + +# --- Main program --- +print("Choose an option:") +print("1: DMs") +print("2: Servers") +choice = input("> ") + +if choice == "1": + dms = list_dms() + dm_index = int(input("Choose a DM to scrape: ")) + channel_id = dms[dm_index]['id'] + custom_name = input("Enter custom filename (without extension): ") +elif choice == "2": + guilds = list_servers() + guild_index = int(input("Choose a server: ")) + channels = list_channels(guilds[guild_index]['id']) + channel_index = int(input("Choose a channel to scrape: ")) + channel_id = channels[channel_index]['id'] + custom_name = f"{guilds[guild_index]['name']}_{channels[channel_index]['name']}" +else: + print("Invalid choice") + exit() + +time_frame = input("Enter time frame (e.g., 1-99h, 1-99d, 1-99m): ") +scrape_messages(channel_id, time_frame, custom_name)