# Search/search/crawling/advanced_crawler.py
import sys
import os

# Add the root directory to sys.path so we can import modules from other
# directories (indexing and serving).
# Any imports from indexing/serving need to happen after this line.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from bs4 import BeautifulSoup
import requests
import time
import random
from queue import Queue
from concurrent.futures import ThreadPoolExecutor
import threading
from urllib.parse import urlparse
import csv

from indexing.advanced_indexing import index_page

# Function to check robots.txt for permission to crawl.
# If we don't do this, we could get blocked/banned,
# since we don't have permission to crawl.
def can_crawl(url):
    parsed_url = urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    print(f"Checking robots.txt for: {robots_url}")
    time.sleep(random.uniform(1, 3))
    try:
        response = requests.get(robots_url, timeout=5)
        response.raise_for_status()
        disallowed_paths = []
        for line in response.text.splitlines():
            if line.startswith("Disallow"):
                parts = line.split()
                if len(parts) > 1:
                    disallowed_paths.append(parts[1])
        for path in disallowed_paths:
            if urlparse(url).path.startswith(path):
                print(f"Disallowed by robots.txt: {url}")
                return False
        return True
    except requests.RequestException:
        print(f"Failed to access robots.txt: {robots_url}")
        return False  # If we can't access robots.txt, assume we can't crawl (we're being nice here)
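
# A minimal alternative sketch using the standard library's robots.txt parser,
# which also understands User-agent sections and Allow rules that the manual
# parsing above skips. Hypothetical helper; the crawler below does not call it.
def can_crawl_stdlib(url, user_agent="*"):
    from urllib.robotparser import RobotFileParser
    parsed_url = urlparse(url)
    parser = RobotFileParser(f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt")
    try:
        parser.read()  # Fetch and parse robots.txt
    except (OSError, ValueError):
        return False  # Same conservative default as can_crawl above
    return parser.can_fetch(user_agent, url)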

# Function to fetch and parse a URL
def crawl(args):
    queue = args['queue']
    visited_urls = args['visited_urls']
    crawl_count = args['crawl_count']
    CRAWL_LIMIT = args['CRAWL_LIMIT']
    lock = args['lock']
    index = args['index']
    webpage_info = args['webpage_info']
    webpage_id_counter = args['webpage_id_counter']
    stop_crawl = args['stop_crawl']
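    # crawl_count and webpage_id_counter are single-element lists so that every
    # worker thread mutates the same shared value in place (under the lock).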
    while not stop_crawl.is_set():
        try:
            current_url = queue.get(timeout=5)
            print("Time to crawl: " + current_url)
        except Exception:
            break  # Exit if no more URLs are available to crawl

        with lock:
            if crawl_count[0] >= CRAWL_LIMIT:
                queue.queue.clear()  # Clear remaining URLs to stop processing
                print("Crawl limit reached. Exiting...")
                stop_crawl.set()
                break
            if current_url in visited_urls:
                queue.task_done()
                continue
            visited_urls.add(current_url)

        """ Checks robots.txt for permission to crawl the page.
        Uncomment this if you want to respect robots.txt.
        WARNING: websites could block/ban you if you crawl without permission.
        """
        # if not can_crawl(current_url):
        #     queue.task_done()
        #     continue
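        # Politeness delay: wait a few seconds between fetches so we don't
        # hammer the servers we're crawling.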
        time.sleep(random.uniform(2, 5))
        try:
            response = requests.get(current_url, timeout=5)
            response.raise_for_status()  # Check for request errors
            content = response.content

            """ Checks for a noindex directive in the page.
            Uncomment this if you want to respect noindex.
            WARNING: websites could block/ban you if you don't have permission.
            """
            # if 'noindex' in content.decode('utf-8').lower():
            #     print(f"Noindex found, skipping: {current_url}")
            #     queue.task_done()
            #     continue

            # Parse the fetched content to find new URLs
            webpage = BeautifulSoup(content, "html.parser")

            # Index the webpage
            indexed_page = index_page(webpage, current_url)
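            # Merge this page's words into the shared inverted index
            # (word -> set of doc ids) and record its metadata under a new doc id.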
            with lock:
                for word in indexed_page["words"]:
                    if word not in index:
                        index[word] = set()
                    index[word].add(webpage_id_counter[0])
                webpage_info[webpage_id_counter[0]] = indexed_page
                webpage_id_counter[0] += 1

            hyperlinks = webpage.select("a[href]")
            new_urls = parse_links(hyperlinks, current_url)
            with lock:
                for new_url in new_urls:
                    if new_url not in visited_urls:
                        queue.put(new_url)
                crawl_count[0] += 1
        except requests.RequestException as e:
            print(f"Failed to fetch {current_url}: {e}")
        finally:
            queue.task_done()

# Function to parse links from HTML content
def parse_links(hyperlinks, current_url):
    urls = []
    for hyperlink in hyperlinks:
        url = hyperlink["href"]
        # Normalize the href into a proper URL
        if url.startswith("#"):
            continue  # Skip same-page anchors
        if url.startswith("//"):
            url = "https:" + url  # Add scheme to protocol-relative URLs
        elif url.startswith("/"):
            # Construct full URL for relative links
            base_url = "{0.scheme}://{0.netloc}".format(urlparse(current_url))
            url = base_url + url
        elif not url.startswith("http"):
            continue  # Skip non-HTTP links (mailto:, javascript:, etc.)
        url = url.split("#")[0]  # Remove anchor
        urls.append(url)
    return urls
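
# A more compact sketch of the same normalization using urllib.parse.urljoin,
# which resolves relative, root-relative, and protocol-relative hrefs in one
# call. Hypothetical helper; the crawler uses parse_links above instead.
def parse_links_urljoin(hyperlinks, current_url):
    from urllib.parse import urljoin, urldefrag
    urls = []
    for hyperlink in hyperlinks:
        url, _fragment = urldefrag(urljoin(current_url, hyperlink["href"]))
        if url.startswith("http"):  # Keep only http(s) links, as above
            urls.append(url)
    return urls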

# Main crawling function
def sloth_bot():
    # Start with the initial pages to crawl
    starting_urls = [
        "https://www.wikipedia.org/wiki/Google",
        "https://www.bbc.com/news/world",
        "https://news.ycombinator.com/",
    ]
    urls_to_crawl = Queue()
    for seed_url in starting_urls:
        urls_to_crawl.put(seed_url)

    visited_urls = set()            # URL tracking
    CRAWL_LIMIT = 20                # Set crawl limit
    crawl_count = [0]               # Shared counter
    lock = threading.Lock()         # Thread safety lock
    index = {}
    webpage_info = {}
    webpage_id_counter = [0]
    stop_crawl = threading.Event()

    # Start concurrent crawling with ThreadPoolExecutor
    # Concurrency = speed
    # Threads go BRRRRR
    # Increase this if you want more threads, but be careful with these.
    NUM_WORKERS = 100
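    # Note: with CRAWL_LIMIT = 20 and only 3 seed URLs, most of these workers
    # will simply time out on the empty queue and exit; the pool size mainly
    # matters if you raise the crawl limit.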
    # Setting up arguments for the crawl function
    args = {
        'queue': urls_to_crawl,
        'visited_urls': visited_urls,
        'crawl_count': crawl_count,
        'CRAWL_LIMIT': CRAWL_LIMIT,
        'lock': lock,
        'index': index,
        'webpage_info': webpage_info,
        'webpage_id_counter': webpage_id_counter,
        'stop_crawl': stop_crawl
    }
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        for _ in range(NUM_WORKERS):
            executor.submit(crawl, args)
    print("All URLs have been crawled")

    """ This part saves the data to CSV files.
    If you don't want to save the data, you can remove/comment out this part.
    If you want to use a database, you can replace this part with a database connection.
    """
    with open('advanced_inverted_index.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['word', 'doc_ids']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for word, doc_ids in index.items():
            writer.writerow({'word': word, 'doc_ids': list(doc_ids)})

    with open('advanced_doc_info.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['doc_id', 'url', 'title', 'description']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for doc_id, info in webpage_info.items():
            writer.writerow({
                'doc_id': doc_id,
                'url': info['url'],
                'title': info['title'],
                'description': info['description']
            })
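
    # Note: list(doc_ids) is written as a Python list literal (e.g. "[0, 2]"),
    # so whatever reads this CSV back needs to parse that string, e.g. with
    # ast.literal_eval.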

def main():
    # Start the crawling process
    sloth_bot()

if __name__ == "__main__":
    main()