Create scrape.py
This commit is contained in:
parent
4aab66b7f0
commit
2c4f19afe4
1 changed file with 137 additions and 0 deletions
137
scrape.py
Normal file
137
scrape.py
Normal file
|
|
@ -0,0 +1,137 @@
|
|||
import requests
|
||||
import json
|
||||
import time
|
||||
from datetime import datetime, timedelta, timezone
|
||||
import re
|
||||
|
||||
# --- User input ---
# The token is read interactively when the module is executed and is placed
# verbatim in the Authorization header of HEADERS.
TOKEN = input("Enter your Discord user token: ")
HEADERS = {"Authorization": TOKEN, "Content-Type": "application/json"}
|
||||
|
||||
DISCORD_EPOCH = 1_420_070_400_000  # Discord epoch in milliseconds


def datetime_to_snowflake(dt):
    """Return the snowflake-style integer corresponding to *dt*.

    The datetime's Unix timestamp is converted to whole milliseconds, offset
    by DISCORD_EPOCH, and shifted left by 22 bits.
    """
    unix_ms = int(dt.timestamp() * 1000)
    return (unix_ms - DISCORD_EPOCH) << 22
|
||||
|
||||
def list_dms():
    """Print and return the current user's DM channels.

    Requests ``/users/@me/channels`` using the module-level HEADERS and
    prints one ``index: label`` line per channel. The label is the first
    recipient's username for channels of type 1 and "Group DM" otherwise.

    Returns:
        The decoded JSON list of channel objects, in the order printed.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            API answers with an error status (e.g. a rejected token).
    """
    response = requests.get(
        "https://discord.com/api/v10/users/@me/channels",
        headers=HEADERS,
        timeout=30,
    )
    # An error reply is not the expected list of channels; fail here with a
    # clear HTTP error instead of crashing later while indexing the payload.
    response.raise_for_status()
    dms = response.json()
    for i, dm in enumerate(dms):
        recipient = dm['recipients'][0]['username'] if dm['type'] == 1 else "Group DM"
        print(f"{i}: {recipient}")
    return dms
|
||||
|
||||
def list_servers():
    """Print and return the guilds (servers) visible to the current user.

    Requests ``/users/@me/guilds`` using the module-level HEADERS and prints
    one ``index: name`` line per guild.

    Returns:
        The decoded JSON list of guild objects, in the order printed.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            API answers with an error status (e.g. a rejected token).
    """
    response = requests.get(
        "https://discord.com/api/v10/users/@me/guilds",
        headers=HEADERS,
        timeout=30,
    )
    # An error reply is not the expected list of guilds; fail here with a
    # clear HTTP error instead of crashing later while indexing the payload.
    response.raise_for_status()
    guilds = response.json()
    for i, guild in enumerate(guilds):
        print(f"{i}: {guild['name']}")
    return guilds
|
||||
|
||||
def list_channels(guild_id):
    """Print and return the channels of one guild.

    Requests ``/guilds/{guild_id}/channels`` using the module-level HEADERS
    and prints one ``index: name (id)`` line per channel.

    Args:
        guild_id: ID of the guild whose channels are listed.

    Returns:
        The decoded JSON list of channel objects, in the order printed.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            API answers with an error status (e.g. missing access).
    """
    response = requests.get(
        f"https://discord.com/api/v10/guilds/{guild_id}/channels",
        headers=HEADERS,
        timeout=30,
    )
    # An error reply is not the expected list of channels; fail here with a
    # clear HTTP error instead of crashing later while indexing the payload.
    response.raise_for_status()
    channels = response.json()
    for i, channel in enumerate(channels):
        print(f"{i}: {channel['name']} ({channel['id']})")
    return channels
|
||||
|
||||
def parse_time_frame(time_frame):
    """Convert a look-back string such as ``"12h"`` into a UTC datetime.

    The string is a non-negative integer followed by a one-letter unit
    (case-insensitive): ``h`` for hours, ``d`` for days, ``m`` for months,
    where a month is counted as 30 days. Surrounding whitespace is ignored.

    Any input that cannot be interpreted (empty string, non-numeric count,
    negative count, unknown unit, or a span too large to represent) prints a
    notice and falls back to one day, so the function never raises for bad
    user input.

    Args:
        time_frame: The look-back string entered by the user.

    Returns:
        A timezone-aware UTC datetime equal to "now" minus the parsed span.
    """
    now = datetime.now(timezone.utc)
    text = time_frame.strip()
    unit_to_delta = {
        'h': lambda n: timedelta(hours=n),
        'd': lambda n: timedelta(days=n),
        'm': lambda n: timedelta(days=n * 30),
    }

    try:
        num = int(text[:-1])
        if num < 0:
            # A negative span would place the cutoff in the future.
            raise ValueError("negative time frame")
        make_delta = unit_to_delta[text[-1].lower()]
        # Both the timedelta construction and the subtraction can overflow
        # for absurdly large counts.
        return now - make_delta(num)
    except (ValueError, KeyError, IndexError, OverflowError):
        print("Invalid time format. Using 1 day as default.")
        return now - timedelta(days=1)
|
||||
|
||||
def sanitize_filename(name):
    """Replace every character that is not an ASCII letter, digit, underscore, or dash with an underscore."""
    disallowed = re.compile(r'[^A-Za-z0-9_\-]')
    return disallowed.sub('_', name)
|
||||
|
||||
def scrape_messages(channel_id, time_frame, custom_name):
    """Download a channel's recent messages and save them to a JSON file.

    Messages newer than the cutoff derived from *time_frame* are fetched in
    pages of up to 100, reduced to ``username`` / ``message`` / ``img`` /
    ``time_sent`` records, sorted by ``time_sent`` and written to
    ``<sanitized custom_name>.json`` in the current directory.

    Fetching is best-effort: on a non-200 status or a network failure the
    loop stops and whatever was collected so far is still written. A 429
    reply is retried after the delay the server reports.

    Args:
        channel_id: ID of the channel to read.
        time_frame: Look-back string understood by parse_time_frame.
        custom_name: Base name of the output file (sanitized, without
            extension).
    """
    url = f"https://discord.com/api/v10/channels/{channel_id}/messages"
    after_datetime = parse_time_frame(time_frame)
    params = {"after": str(datetime_to_snowflake(after_datetime)), "limit": 100}
    messages_data = []
    seen_ids = set()  # Guards against a message being recorded twice

    while True:
        try:
            response = requests.get(url, headers=HEADERS, params=params, timeout=30)
        except requests.RequestException as exc:
            # Keep what has been collected instead of losing it to a
            # transient network problem.
            print(f"Request failed: {exc}")
            break

        if response.status_code == 429:
            retry_after = response.json().get("retry_after", 1)
            print(f"Rate limited. Sleeping for {retry_after} seconds...")
            time.sleep(retry_after)
            continue

        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            break

        messages = response.json()
        if not messages:
            break

        for msg in messages:
            if msg['id'] in seen_ids:  # Skip duplicates
                continue
            seen_ids.add(msg['id'])
            messages_data.append({
                "username": msg['author']['username'],
                "message": msg.get('content', ''),
                # Only the first attachment's URL is recorded.
                "img": msg['attachments'][0]['url'] if msg.get('attachments') else None,
                "time_sent": msg['timestamp'],
            })

        # Advance the cursor past the NEWEST message of this page. Pages
        # come back newest-first, so the last element is the oldest one;
        # continuing from it would re-fetch almost the whole page and move
        # forward by a single message per request.
        params['after'] = max((m['id'] for m in messages), key=int)
        time.sleep(0.1)

    # Sort messages chronologically
    messages_data.sort(key=lambda x: x['time_sent'])

    filename = sanitize_filename(custom_name) + ".json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(messages_data, f, indent=4)

    print(f"Scraped {len(messages_data)} messages. Saved to {filename}")
|
||||
|
||||
# --- Main program ---
def _prompt_index(prompt, items):
    """Ask until the user enters a valid 0-based index into *items*.

    Exits the program when *items* is empty, since no choice is possible.
    Non-numeric, negative, and out-of-range answers are rejected and the
    question is asked again.
    """
    if not items:
        print("Nothing to choose from.")
        raise SystemExit(1)
    while True:
        answer = input(prompt)
        try:
            index = int(answer)
        except ValueError:
            index = -1
        # Negative values are refused explicitly: Python would otherwise
        # accept them and silently pick an entry counted from the end.
        if 0 <= index < len(items):
            return index
        print(f"Please enter a number between 0 and {len(items) - 1}.")


print("Choose an option:")
print("1: DMs")
print("2: Servers")
choice = input("> ").strip()

if choice == "1":
    dms = list_dms()
    dm_index = _prompt_index("Choose a DM to scrape: ", dms)
    channel_id = dms[dm_index]['id']
    custom_name = input("Enter custom filename (without extension): ")
elif choice == "2":
    guilds = list_servers()
    guild_index = _prompt_index("Choose a server: ", guilds)
    channels = list_channels(guilds[guild_index]['id'])
    channel_index = _prompt_index("Choose a channel to scrape: ", channels)
    channel_id = channels[channel_index]['id']
    # Server scrapes are named automatically as "<server>_<channel>".
    custom_name = f"{guilds[guild_index]['name']}_{channels[channel_index]['name']}"
else:
    print("Invalid choice")
    # exit() is an interactive helper injected by the site module; raising
    # SystemExit works in every environment and keeps the same exit status.
    raise SystemExit

time_frame = input("Enter time frame (e.g., 1-99h, 1-99d, 1-99m): ")
scrape_messages(channel_id, time_frame, custom_name)
|
||||
Loading…
Add table
Reference in a new issue