Search/search/complete_examples/advanced_pagerank.py

from bs4 import BeautifulSoup
import requests
import time
import random
from queue import Queue
from concurrent.futures import ThreadPoolExecutor
import threading
from urllib.parse import urlparse
import csv
import sys
import os

# Add the root directory to sys.path so that modules from the sibling
# directories (indexing and serving) can be imported.
# Any imports from indexing/serving need to happen after this line.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from indexing.advanced_indexing import advanced_index_page
from serving.pagerank import compute_pagerank


# Function to check robots.txt for permission to crawl
# If we don't do this, we could get blocked/banned,
# since we don't have permission to crawl.
def can_crawl(url):
    parsed_url = urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    print(f"Checking robots.txt for: {robots_url}")
    time.sleep(random.uniform(1, 3))
    try:
        response = requests.get(robots_url, timeout=5)
        response.raise_for_status()
        disallowed_paths = []
        for line in response.text.splitlines():
            if line.startswith("Disallow"):
                parts = line.split()
                if len(parts) > 1:
                    disallowed_paths.append(parts[1])
        for path in disallowed_paths:
            if urlparse(url).path.startswith(path):
                print(f"Disallowed by robots.txt: {url}")
                return False
        return True
    except requests.RequestException:
        print(f"Failed to access robots.txt: {robots_url}")
        return False  # If we can't access robots.txt, assume we can't crawl (we're being nice here)
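

# A minimal alternative sketch (not used by the crawler below): Python's standard
# library ships urllib.robotparser, which also understands per-User-agent sections
# and Allow rules that the simple parser above ignores. The function name and its
# conservative fallback are illustrative assumptions, not part of the original code.
def can_crawl_with_robotparser(url, user_agent="*"):
    from urllib.robotparser import RobotFileParser  # local import keeps this optional helper self-contained
    parsed_url = urlparse(url)
    robots_parser = RobotFileParser(f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt")
    try:
        robots_parser.read()  # Fetch and parse robots.txt with urllib
    except Exception:
        return False  # Same conservative default as can_crawl above
    return robots_parser.can_fetch(user_agent, url)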


# Function to fetch and parse a URL pulled from the shared queue
def crawl(args):
    queue = args['queue']
    visited_urls = args['visited_urls']
    crawl_count = args['crawl_count']
    CRAWL_LIMIT = args['CRAWL_LIMIT']
    lock = args['lock']
    index = args['index']
    webpage_info = args['webpage_info']
    webpage_id_counter = args['webpage_id_counter']
    pagerank_graph = args['pagerank_graph']
    stop_crawl = args['stop_crawl']
    while not stop_crawl.is_set():
        try:
            current_url = queue.get(timeout=5)
            print("Time to crawl: " + current_url)
        except Exception:
            break  # Exit if no more URLs are available to crawl
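        # A worker exits in one of two ways: stop_crawl gets set once the crawl
        # limit is reached, or queue.get() times out after 5 seconds with nothing
        # left in the queue.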
        with lock:
            if crawl_count[0] >= CRAWL_LIMIT:
                queue.queue.clear()  # Clear remaining URLs to stop processing
                print("Crawl limit reached. Exiting...")
                stop_crawl.set()
                break
            if current_url in visited_urls:
                queue.task_done()
                continue
            visited_urls.add(current_url)
""" Checks for noindex directive in the page
Comment this out if you don't care about noindex
WARNING: websites could block/ban you if you don't have permission
"""
if not can_crawl(current_url):
queue.task_done()
continue
time.sleep(random.uniform(2, 5))
try:
response = requests.get(current_url, timeout=5)
response.raise_for_status() # Check for request errors
content = response.content
""" Checks for noindex directive in the page
Comment this out if you don't care about noindex
WARNING: websites could block/ban you if you don't have permission
"""
if 'noindex' in content.decode('utf-8').lower():
print(f"Noindex found, skipping: {current_url}")
queue.task_done()
continue
            # Parse the fetched content to find new URLs
            webpage = BeautifulSoup(content, "html.parser")
            # Index the webpage
            indexed_page = advanced_index_page(webpage, current_url)
            with lock:
                for word in indexed_page["words"]:
                    if word not in index:
                        index[word] = set()
                    index[word].add(webpage_id_counter[0])
                webpage_info[webpage_id_counter[0]] = indexed_page
                webpage_id_counter[0] += 1
            hyperlinks = webpage.select("a[href]")
            # NEW: Add hyperlink connections for pagerank
            new_urls, hyperlink_connections = parse_links(hyperlinks, current_url)
            pagerank_graph[current_url] = hyperlink_connections
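            # pagerank_graph is a plain adjacency mapping from each crawled URL to
            # the set of URLs it links out to, e.g. (illustrative URLs only):
            #   {"https://a.example/": {"https://b.example/", "https://c.example/"}}
            # compute_pagerank takes this graph as its input in sloth_bot below.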
            with lock:
                for new_url in new_urls:
                    if new_url not in visited_urls:
                        queue.put(new_url)
                crawl_count[0] += 1
        except requests.RequestException as e:
            print(f"Failed to fetch {current_url}: {e}")
        finally:
            queue.task_done()


# Function to parse links from HTML content
def parse_links(hyperlinks, current_url):
    urls = []
    # NEW: Track hyperlink connections for pagerank
    hyperlink_connections = set()
    for hyperlink in hyperlinks:
        url = hyperlink["href"]
        # Normalize the href into a full URL
        if url.startswith("#"):
            continue  # Skip same-page anchors
        if url.startswith("//"):
            url = "https:" + url  # Add scheme to protocol-relative URLs
        elif url.startswith("/"):
            # Construct full URL for relative links
            base_url = "{0.scheme}://{0.netloc}".format(urlparse(current_url))
            url = base_url + url
        elif not url.startswith("http"):
            continue  # Skip non-HTTP links
        url = url.split("#")[0]  # Remove anchor
        hyperlink_connections.add(url)
        urls.append(url)
    return urls, hyperlink_connections
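
# Note on parse_links coverage: hrefs that are relative without a leading slash
# (e.g. "page.html" or "../about") fall through to the `not url.startswith("http")`
# check and are skipped. If you want those too, urllib.parse.urljoin(current_url, url)
# is the standard way to resolve them into absolute URLs.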


# Main crawling function
def sloth_bot():
    # Start with the initial pages to crawl
    starting_urls = [
        "https://www.wikipedia.org/wiki/Google",
        "https://www.bbc.com/news/world",
        "https://news.ycombinator.com/",
    ]
    urls_to_crawl = Queue()
    for seed_url in starting_urls:
        urls_to_crawl.put(seed_url)
    visited_urls = set()  # URL tracking
    CRAWL_LIMIT = 20  # Set crawl limit
    crawl_count = [0]  # Shared counter
    lock = threading.Lock()  # Thread safety lock
    index = {}
    webpage_info = {}
    # NEW: Link graph for pagerank.
    # Maps each crawled URL to the set of URLs it links out to.
    pagerank_graph = {}
    webpage_id_counter = [0]
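    # crawl_count and webpage_id_counter are one-element lists so that the worker
    # threads can mutate the shared values in place; all updates to them happen
    # inside `with lock:` blocks in crawl().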
    stop_crawl = threading.Event()
    # Start concurrent crawling with ThreadPoolExecutor
    # Concurrency = speed
    # Threads go BRRRRR
    # Increase this if you want more threads, but be careful with these.
    NUM_WORKERS = 100
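    # With CRAWL_LIMIT set to 20, most of these 100 workers will just sit waiting
    # on the queue; a much smaller pool (e.g. 5-10) is usually plenty and is
    # gentler on the sites being crawled.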
    # Setting up arguments for the crawl function
    args = {
        'queue': urls_to_crawl,
        'visited_urls': visited_urls,
        'crawl_count': crawl_count,
        'CRAWL_LIMIT': CRAWL_LIMIT,
        'lock': lock,
        'index': index,
        'webpage_info': webpage_info,
        'webpage_id_counter': webpage_id_counter,
        'pagerank_graph': pagerank_graph,
        'stop_crawl': stop_crawl
    }
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        for _ in range(NUM_WORKERS):
            executor.submit(crawl, args)
    print("All URLs have been crawled")
    # NEW: Computes pagerank
    pagerank_scores = compute_pagerank(pagerank_graph)
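    # compute_pagerank (see serving/pagerank.py) is expected to return a dict
    # mapping URL -> PageRank score; the CSV writer below looks each page up
    # with pagerank_scores.get(info['url'], 0).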
""" This part is for saving the data to CSV files.
However, if you don't want to save the data, you can remove/comment out this part.
If you want to use a database, you can replace this part with a database connection.
"""
with open('advanced_pagerank_inverted_index.csv', 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['word', 'doc_ids']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for word, doc_ids in index.items():
writer.writerow({'word': word, 'doc_ids': list(doc_ids)})
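    # Note: the doc_ids column is written as the string form of a Python list
    # (e.g. "[0, 3, 7]"), so a consumer of this CSV needs to parse it back, for
    # instance with ast.literal_eval.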
    with open('advanced_pagerank.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['doc_id', 'url', 'title', 'description', 'pagerank']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for doc_id, info in webpage_info.items():
            writer.writerow({
                'doc_id': doc_id,
                'url': info['url'],
                'title': info['title'],
                'description': info['description'],
                'pagerank': pagerank_scores.get(info['url'], 0)
            })


# Entry point for the script
def main():
    sloth_bot()


if __name__ == "__main__":
    main()