# Search/search/crawling/advanced_crawler.py
import sys
import os

# Add the root directory to sys.path so we can import modules from other
# directories (indexing and serving).
# Any imports from indexing/serving need to happen after this line.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from bs4 import BeautifulSoup
import requests
import time
import random
from queue import Queue
from concurrent.futures import ThreadPoolExecutor
import threading
from urllib.parse import urlparse
import csv

from indexing.advanced_indexing import index_page

# Function to check robots.txt for permission to crawl.
# If we don't do this, we could get blocked/banned,
# since we don't have permission to crawl.
def can_crawl(url):
    parsed_url = urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    print(f"Checking robots.txt for: {robots_url}")
    time.sleep(random.uniform(1, 3))
    try:
        response = requests.get(robots_url, timeout=5)
        response.raise_for_status()
        disallowed_paths = []
        for line in response.text.splitlines():
            if line.startswith("Disallow"):
                parts = line.split()
                if len(parts) > 1:
                    disallowed_paths.append(parts[1])
        for path in disallowed_paths:
            if urlparse(url).path.startswith(path):
                print(f"Disallowed by robots.txt: {url}")
                return False
        return True
    except requests.RequestException:
        print(f"Failed to access robots.txt: {robots_url}")
        return False  # If we can't access robots.txt, assume we can't crawl (we're being nice here)
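
# A minimal alternative sketch using the standard library's robots.txt parser,
# which also understands User-agent sections and Allow rules that the manual
# parsing above skips. Hypothetical helper; the crawler below does not call it.
def can_crawl_stdlib(url, user_agent="*"):
    from urllib.robotparser import RobotFileParser
    parsed_url = urlparse(url)
    parser = RobotFileParser(f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt")
    try:
        parser.read()  # Fetch and parse robots.txt
    except (OSError, ValueError):
        return False  # Same conservative default as can_crawl above
    return parser.can_fetch(user_agent, url)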

# Function to fetch and parse a URL
def crawl(args):
    queue = args['queue']
    visited_urls = args['visited_urls']
    crawl_count = args['crawl_count']
    CRAWL_LIMIT = args['CRAWL_LIMIT']
    lock = args['lock']
    index = args['index']
    webpage_info = args['webpage_info']
    webpage_id_counter = args['webpage_id_counter']
    stop_crawl = args['stop_crawl']
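    # crawl_count and webpage_id_counter are single-element lists so that every
    # worker thread mutates the same shared value in place (under the lock).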
    while not stop_crawl.is_set():
        try:
            current_url = queue.get(timeout=5)
            print("Time to crawl: " + current_url)
        except Exception:
            break  # Exit if no more URLs are available to crawl

        with lock:
            if crawl_count[0] >= CRAWL_LIMIT:
                queue.queue.clear()  # Clear remaining URLs to stop processing
                print("Crawl limit reached. Exiting...")
                stop_crawl.set()
                break
            if current_url in visited_urls:
                queue.task_done()
                continue
            visited_urls.add(current_url)

        """ Checks robots.txt for permission to crawl the page.
        Uncomment this if you want to respect robots.txt.
        WARNING: websites could block/ban you if you crawl without permission.
        """
        # if not can_crawl(current_url):
        #     queue.task_done()
        #     continue
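        # Politeness delay: wait a few seconds between fetches so we don't
        # hammer the servers we're crawling.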
        time.sleep(random.uniform(2, 5))
        try:
            response = requests.get(current_url, timeout=5)
            response.raise_for_status()  # Check for request errors
            content = response.content

            """ Checks for a noindex directive in the page.
            Uncomment this if you want to respect noindex.
            WARNING: websites could block/ban you if you don't have permission.
            """
            # if 'noindex' in content.decode('utf-8').lower():
            #     print(f"Noindex found, skipping: {current_url}")
            #     queue.task_done()
            #     continue

            # Parse the fetched content to find new URLs
            webpage = BeautifulSoup(content, "html.parser")

            # Index the webpage
            indexed_page = index_page(webpage, current_url)
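            # Merge this page's words into the shared inverted index
            # (word -> set of doc ids) and record its metadata under a new doc id.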
            with lock:
                for word in indexed_page["words"]:
                    if word not in index:
                        index[word] = set()
                    index[word].add(webpage_id_counter[0])
                webpage_info[webpage_id_counter[0]] = indexed_page
                webpage_id_counter[0] += 1

            hyperlinks = webpage.select("a[href]")
            new_urls = parse_links(hyperlinks, current_url)
            with lock:
                for new_url in new_urls:
                    if new_url not in visited_urls:
                        queue.put(new_url)
                crawl_count[0] += 1
        except requests.RequestException as e:
            print(f"Failed to fetch {current_url}: {e}")
        finally:
            queue.task_done()

# Function to parse links from HTML content
def parse_links(hyperlinks, current_url):
    urls = []
    for hyperlink in hyperlinks:
        url = hyperlink["href"]
        # Normalize the href into a proper URL
        if url.startswith("#"):
            continue  # Skip same-page anchors
        if url.startswith("//"):
            url = "https:" + url  # Add scheme to protocol-relative URLs
        elif url.startswith("/"):
            # Construct full URL for relative links
            base_url = "{0.scheme}://{0.netloc}".format(urlparse(current_url))
            url = base_url + url
        elif not url.startswith("http"):
            continue  # Skip non-HTTP links (mailto:, javascript:, etc.)
        url = url.split("#")[0]  # Remove anchor
        urls.append(url)
    return urls
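
# A more compact sketch of the same normalization using urllib.parse.urljoin,
# which resolves relative, root-relative, and protocol-relative hrefs in one
# call. Hypothetical helper; the crawler uses parse_links above instead.
def parse_links_urljoin(hyperlinks, current_url):
    from urllib.parse import urljoin, urldefrag
    urls = []
    for hyperlink in hyperlinks:
        url, _fragment = urldefrag(urljoin(current_url, hyperlink["href"]))
        if url.startswith("http"):  # Keep only http(s) links, as above
            urls.append(url)
    return urls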

# Main crawling function
def sloth_bot():
    # Start with the initial pages to crawl
    starting_urls = [
        "https://www.wikipedia.org/wiki/Google",
        "https://www.bbc.com/news/world",
        "https://news.ycombinator.com/",
    ]
    urls_to_crawl = Queue()
    for seed_url in starting_urls:
        urls_to_crawl.put(seed_url)

    visited_urls = set()            # URL tracking
    CRAWL_LIMIT = 20                # Set crawl limit
    crawl_count = [0]               # Shared counter
    lock = threading.Lock()         # Thread safety lock
    index = {}
    webpage_info = {}
    webpage_id_counter = [0]
    stop_crawl = threading.Event()

    # Start concurrent crawling with ThreadPoolExecutor
    # Concurrency = speed
    # Threads go BRRRRR
    # Increase this if you want more threads, but be careful with these.
    NUM_WORKERS = 100
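    # Note: with CRAWL_LIMIT = 20 and only 3 seed URLs, most of these workers
    # will simply time out on the empty queue and exit; the pool size mainly
    # matters if you raise the crawl limit.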
    # Setting up arguments for the crawl function
    args = {
        'queue': urls_to_crawl,
        'visited_urls': visited_urls,
        'crawl_count': crawl_count,
        'CRAWL_LIMIT': CRAWL_LIMIT,
        'lock': lock,
        'index': index,
        'webpage_info': webpage_info,
        'webpage_id_counter': webpage_id_counter,
        'stop_crawl': stop_crawl
    }
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        for _ in range(NUM_WORKERS):
            executor.submit(crawl, args)
    print("All URLs have been crawled")

    """ This part saves the data to CSV files.
    If you don't want to save the data, you can remove/comment out this part.
    If you want to use a database, you can replace this part with a database connection.
    """
    with open('advanced_inverted_index.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['word', 'doc_ids']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for word, doc_ids in index.items():
            writer.writerow({'word': word, 'doc_ids': list(doc_ids)})

    with open('advanced_doc_info.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['doc_id', 'url', 'title', 'description']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for doc_id, info in webpage_info.items():
            writer.writerow({
                'doc_id': doc_id,
                'url': info['url'],
                'title': info['title'],
                'description': info['description']
            })
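
    # Note: list(doc_ids) is written as a Python list literal (e.g. "[0, 2]"),
    # so whatever reads this CSV back needs to parse that string, e.g. with
    # ast.literal_eval.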

def main():
    # Start the crawling process
    sloth_bot()

if __name__ == "__main__":
    main()