Update scrape.py

Added 2 threads for faster scanning and a live-monitoring option for chat. In ScrapeBack mode, data is saved every 200 messages (SAVE_INTERVAL), so progress is kept if you need to interrupt with Ctrl+C.
2025-09-20 17:03:57 -05:00 · 2025-09-20 17:03:57 -05:00 · 3e48a9955a
commit 3e48a9955a
parent ac9e02010b
1 changed files with 180 additions and 69 deletions
--- a/scrape.py
+++ b/scrape.py
@ -1,8 +1,10 @@
 import requests
 import json
 import time
+import threading
 from datetime import datetime, timedelta, timezone
 import re
+from queue import Queue

 # --- User input ---
 TOKEN = input("Enter your Discord user token: ")
@ -11,107 +13,202 @@ HEADERS = {
    "Content-Type": "application/json"
 }

-DISCORD_EPOCH = 1420070400000  # Discord epoch in milliseconds
+DISCORD_EPOCH = 1420070400000  # Discord epoch in ms
+SAVE_INTERVAL = 200  # save every N messages

+# --- Utility Functions ---
 def datetime_to_snowflake(dt):
    timestamp_ms = int(dt.timestamp() * 1000)
-    snowflake = (timestamp_ms - DISCORD_EPOCH) << 22
-    return snowflake
+    return str((timestamp_ms - DISCORD_EPOCH) << 22)

+def parse_time_frame(time_frame):
+    now = datetime.now(timezone.utc)
+    num = int(time_frame[:-1])
+    unit = time_frame[-1].lower()
+    if unit == 'h':
+        delta = timedelta(hours=num)
+    elif unit == 'd':
+        delta = timedelta(days=num)
+    elif unit == 'm':
+        delta = timedelta(days=num * 30)
+    else:
+        delta = timedelta(days=1)
+    return now - delta
+
+def sanitize_filename(name):
+    return re.sub(r'[^A-Za-z0-9_\-]', '_', name)
+
+def save_message_to_json(filename, msg):
+    try:
+        data = []
+        try:
+            with open(filename, "r", encoding="utf-8") as f:
+                data = json.load(f)
+        except FileNotFoundError:
+            pass
+        # Prevent duplicates
+        if msg["id"] not in [m["id"] for m in data]:
+            data.append({
+                "id": msg["id"],
+                "username": msg["author"]["username"],
+                "message": msg.get("content", ""),
+                "img": msg["attachments"][0]["url"] if msg.get("attachments") else None,
+                "time_sent": msg["timestamp"]
+            })
+            with open(filename, "w", encoding="utf-8") as f:
+                json.dump(data, f, indent=4)
+    except Exception as e:
+        print(f"[-] Error saving message: {e}")
+
+# --- Server & DM Listing ---
 def list_dms():
-    response = requests.get("https://discord.com/api/v10/users/@me/channels", headers=HEADERS)
-    dms = response.json()
+    r = requests.get("https://discord.com/api/v10/users/@me/channels", headers=HEADERS)
+    dms = r.json()
    for i, dm in enumerate(dms):
        recipient = dm['recipients'][0]['username'] if dm['type'] == 1 else "Group DM"
        print(f"{i}: {recipient}")
    return dms

 def list_servers():
-    response = requests.get("https://discord.com/api/v10/users/@me/guilds", headers=HEADERS)
-    guilds = response.json()
-    for i, guild in enumerate(guilds):
-        print(f"{i}: {guild['name']}")
+    r = requests.get("https://discord.com/api/v10/users/@me/guilds", headers=HEADERS)
+    guilds = r.json()
+    for i, g in enumerate(guilds):
+        print(f"{i}: {g['name']}")
    return guilds

 def list_channels(guild_id):
-    response = requests.get(f"https://discord.com/api/v10/guilds/{guild_id}/channels", headers=HEADERS)
-    channels = response.json()
-    for i, channel in enumerate(channels):
-        print(f"{i}: {channel['name']} ({channel['id']})")
-    return channels
+    r = requests.get(f"https://discord.com/api/v10/guilds/{guild_id}/channels", headers=HEADERS)
+    chans = r.json()
+    for i, c in enumerate(chans):
+        print(f"{i}: {c['name']} ({c['id']})")
+    return chans

-def parse_time_frame(time_frame):
-    now = datetime.now(timezone.utc)
-    num = int(time_frame[:-1])
-    unit = time_frame[-1].lower()
-    
-    if unit == 'h':
-        delta = timedelta(hours=num)
-    elif unit == 'd':
-        delta = timedelta(days=num)
-    elif unit == 'm':
-        delta = timedelta(days=num*30)
-    else:
-        print("Invalid time format. Using 1 day as default.")
-        delta = timedelta(days=1)
-    
-    return now - delta
-
-def sanitize_filename(name):
-    """Remove emojis and keep only letters, numbers, underscores, and dashes."""
-    return re.sub(r'[^A-Za-z0-9_\-]', '_', name)
-
-def scrape_messages(channel_id, time_frame, custom_name):
+# --- Historical Scrape ---
+def fetch_messages(channel_id, direction, start_id, out_queue):
    url = f"https://discord.com/api/v10/channels/{channel_id}/messages"
-    after_datetime = parse_time_frame(time_frame)
-    after_snowflake = str(datetime_to_snowflake(after_datetime))
-    messages_data = []
-    seen_ids = set()  # Track message IDs to avoid duplicates
-
-    params = {"after": after_snowflake, "limit": 100}
+    params = {"limit": 100}
+    params[direction] = start_id

    while True:
        response = requests.get(url, headers=HEADERS, params=params)

        if response.status_code == 429:
            retry_after = response.json().get("retry_after", 1)
-            print(f"Rate limited. Sleeping for {retry_after} seconds...")
+            print(f"Rate limited ({direction}). Sleeping {retry_after}s...")
            time.sleep(retry_after)
            continue

        if response.status_code != 200:
-            print(f"Error: {response.status_code} - {response.text}")
+            print(f"Error {direction}: {response.status_code} - {response.text}")
            break

        messages = response.json()
        if not messages:
            break

+        # always ordered newest -> oldest, so adjust pagination
+        if direction == "after":
+            params["after"] = messages[-1]["id"]
+        else:
+            params["before"] = messages[-1]["id"]
+
        for msg in messages:
-            if msg['id'] in seen_ids:  # Skip duplicates
-                continue
-            seen_ids.add(msg['id'])
-            entry = {
-                "username": msg['author']['username'],
-                "message": msg.get('content', ''),
-                "img": msg['attachments'][0]['url'] if msg.get('attachments') else None,
-                "time_sent": msg['timestamp']
-            }
-            messages_data.append(entry)
+            out_queue.put({
+                "id": msg["id"],
+                "username": msg["author"]["username"],
+                "message": msg.get("content", ""),
+                "img": msg["attachments"][0]["url"] if msg.get("attachments") else None,
+                "time_sent": msg["timestamp"]
+            })

-        params['after'] = messages[-1]['id']
-        time.sleep(0.1)
+def scrape_messages(channel_id, time_frame, custom_name):
+    after_datetime = parse_time_frame(time_frame)
+    after_snowflake = datetime_to_snowflake(after_datetime)

-    # Sort messages chronologically
-    messages_data.sort(key=lambda x: x['time_sent'])
+    out_queue = Queue()
+    threads = []

+    # thread 1: forward from cutoff
+    t1 = threading.Thread(target=fetch_messages, args=(channel_id, "after", after_snowflake, out_queue))
+    threads.append(t1)
+
+    # thread 2: backward from now
+    t2 = threading.Thread(target=fetch_messages, args=(channel_id, "before", "999999999999999999", out_queue))
+    threads.append(t2)
+
+    for t in threads:
+        t.start()
+
+    messages_data = {}
+    saved_count = 0
+
+    while any(t.is_alive() for t in threads) or not out_queue.empty():
+        while not out_queue.empty():
+            msg = out_queue.get()
+            if msg["id"] not in messages_data:  # dedup by ID
+                messages_data[msg["id"]] = msg
+
+        if len(messages_data) - saved_count >= SAVE_INTERVAL:
+            filename = sanitize_filename(custom_name) + ".json"
+            with open(filename, "w", encoding="utf-8") as f:
+                json.dump(sorted(messages_data.values(), key=lambda x: x["time_sent"]), f, indent=4)
+            saved_count = len(messages_data)
+            print(f"Progress: {saved_count} messages saved...")
+
+        time.sleep(0.2)
+
+    for t in threads:
+        t.join()
+
+    # final save
    filename = sanitize_filename(custom_name) + ".json"
-    with open(filename, 'w', encoding='utf-8') as f:
-        json.dump(messages_data, f, indent=4)
+    with open(filename, "w", encoding="utf-8") as f:
+        json.dump(sorted(messages_data.values(), key=lambda x: x["time_sent"]), f, indent=4)

-    print(f"Scraped {len(messages_data)} messages. Saved to {filename}")
+    print(f"Finished: {len(messages_data)} messages scraped. Saved to {filename}")

-# --- Main program ---
+# --- Live Monitor ---
+def live_monitor(channel_id):
+    filename = f"channel_{channel_id}_live.json"
+    last_message_id = None
+    print("\n[+] Live monitoring started. Press CTRL+C to stop.\n")
+    try:
+        while True:
+            url = f"https://discord.com/api/v10/channels/{channel_id}/messages?limit=50"
+            if last_message_id:
+                url += f"&after={last_message_id}"
+
+            try:
+                r = requests.get(url, headers=HEADERS)
+                if r.status_code != 200:
+                    print(f"[-] Error fetching messages: {r.status_code} {r.text}")
+                    time.sleep(2)
+                    continue
+                messages = r.json()
+            except Exception as e:
+                print(f"[-] Exception: {e}")
+                time.sleep(2)
+                continue
+
+            messages = sorted(messages, key=lambda x: int(x["id"]))  # oldest first
+
+            for msg in messages:
+                author = msg["author"]["username"]
+                content = msg.get("content", "")
+                attachments = [a["url"] for a in msg.get("attachments", [])]
+                print(f"{author}: {content}")
+                if attachments:
+                    for a in attachments:
+                        print(f"[Attachment] {a}")
+                save_message_to_json(filename, msg)
+                last_message_id = msg["id"]  # update to latest message
+
+            time.sleep(2)
+
+    except KeyboardInterrupt:
+        print("\n[!] Live monitoring stopped. Data saved.")
+
+# --- Main Program ---
 print("Choose an option:")
 print("1: DMs")
 print("2: Servers")
@ -119,19 +216,33 @@ choice = input("> ")

 if choice == "1":
    dms = list_dms()
-    dm_index = int(input("Choose a DM to scrape: "))
+    dm_index = int(input("Choose a DM: "))
    channel_id = dms[dm_index]['id']
-    custom_name = input("Enter custom filename (without extension): ")
+    custom_name = input("Custom filename (no extension): ")
+    time_frame = input("Enter time frame (e.g., 1-99h, 1-99d, 1-99m): ")
+    scrape_messages(channel_id, time_frame, custom_name)
+
 elif choice == "2":
    guilds = list_servers()
    guild_index = int(input("Choose a server: "))
    channels = list_channels(guilds[guild_index]['id'])
-    channel_index = int(input("Choose a channel to scrape: "))
+    channel_index = int(input("Choose a channel: "))
    channel_id = channels[channel_index]['id']
-    custom_name = f"{guilds[guild_index]['name']}_{channels[channel_index]['name']}"
+
+    print("Choose mode:")
+    print("1: Live-Monitor")
+    print("2: ScrapeBack")
+    mode = input("> ")
+
+    if mode == "1":
+        live_monitor(channel_id)
+    elif mode == "2":
+        custom_name = f"{guilds[guild_index]['name']}_{channels[channel_index]['name']}"
+        time_frame = input("Enter time frame (e.g., 1-99h, 1-99d, 1-99m): ")
+        scrape_messages(channel_id, time_frame, custom_name)
+    else:
+        print("Invalid choice")
+        exit()
 else:
    print("Invalid choice")
    exit()
-
-time_frame = input("Enter time frame (e.g., 1-99h, 1-99d, 1-99m): ")
-scrape_messages(channel_id, time_frame, custom_name)