# Search/search/complete_examples/simple_pagerank.py
from bs4 import BeautifulSoup
import requests
import time
import random
import csv
import sys
import os
# Add the project root directory to sys.path so that modules from the
# indexing and serving packages can be imported.
# Any imports from indexing/serving must come after this line.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from indexing.simple_indexing import simple_index_page
from serving.pagerank import compute_pagerank
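# Interfaces assumed by this script (inferred from how the return values are
# used below):
#   - simple_index_page(soup, url) is expected to return a dict with at least
#     'title', 'description', and 'words' keys.
#   - compute_pagerank(graph) is expected to take a {url: set of outgoing URLs}
#     mapping and return a {url: score} mapping.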
def sloth_bot():
    # Our list of URLs to crawl
    urls = ["https://en.wikipedia.org/wiki/Google"]
    visited_urls = set()

    # Create the index and graph
    index = {}  # URL -> page contents
    pagerank_graph = {}  # URL -> set of URLs it links to

    CRAWL_LIMIT = 5
    crawl_count = 0

    # Loops through the list of URLs
    while urls and crawl_count < CRAWL_LIMIT:
        # Grab the next URL
        current_url = urls.pop()
        if current_url in visited_urls:
            continue

        print("Time to crawl: " + current_url)
        time.sleep(random.uniform(1, 2))

        try:
            response = requests.get(current_url)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to retrieve {current_url}: {e}")
            continue
        # Parse the content of the page
        webpage = BeautifulSoup(response.content, "html.parser")

        # Add the page to the index
        indexed_page = simple_index_page(webpage, current_url)
        index[current_url] = indexed_page
        visited_urls.add(current_url)

        # Grab the links from the page
        hyperlinks = webpage.select("a[href]")

        # This is where we store our connected pages
        hyperlink_connections = set()
        for hyperlink in hyperlinks:
            url = hyperlink["href"]

            # Format the URL into a proper URL
            if url.startswith("#"):
                continue
            if url.startswith("//"):
                url = "https:" + url
            elif url.startswith("/"):
                base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url))
                url = base_url + url
            elif not url.startswith("http"):
                continue
            url = url.split('#')[0]
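            # Note: urllib.parse.urljoin(current_url, url) would cover the same
            # cases (plus bare relative paths such as "subpage"), but the explicit
            # branches above keep each case visible.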
            # Add to the link connections
            hyperlink_connections.add(url)

            # If we haven't visited this URL yet, add it to our list
            if url not in visited_urls:
                urls.append(url)

        # Update the page's outgoing links
        index[current_url]['hyperlink_connections'] = hyperlink_connections
        pagerank_graph[current_url] = hyperlink_connections

        crawl_count += 1
        print(f"Crawled count: {crawl_count}, index size: {len(index)}, URLs left: {len(urls)}")

    # Compute PageRank
    pagerank_scores = compute_pagerank(pagerank_graph)
""" This part is for saving the data to CSV files.
However, if you don't want to save the data, you can remove/comment out this part.
If you want to use a database, you can replace this part with a database connection.
"""
with open('simple_pagerank.csv', 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ["url", "title", "description", "pagerank", "words"]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for url, info in index.items():
writer.writerow({
'url': url,
'title': info['title'],
'description': info['description'],
'pagerank': pagerank_scores.get(url, 0),
'words': ', '.join(info['words'])
})
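# The comment above suggests swapping the CSV export for a database connection.
# The helper below is a minimal, hypothetical sketch of that idea using the
# standard-library sqlite3 module; it is not part of the original project and is
# never called here. It assumes the same `index` and `pagerank_scores` structures
# that sloth_bot() builds.
def save_to_sqlite(index, pagerank_scores, db_path='simple_pagerank.db'):
    """Minimal sketch: persist crawl results to SQLite instead of CSV."""
    import sqlite3

    connection = sqlite3.connect(db_path)
    try:
        # One row per crawled page, mirroring the CSV columns above.
        connection.execute(
            "CREATE TABLE IF NOT EXISTS pages ("
            "url TEXT PRIMARY KEY, title TEXT, description TEXT, "
            "pagerank REAL, words TEXT)"
        )
        connection.executemany(
            "INSERT OR REPLACE INTO pages VALUES (?, ?, ?, ?, ?)",
            [
                (
                    url,
                    info['title'],
                    info['description'],
                    pagerank_scores.get(url, 0),
                    ', '.join(info['words']),
                )
                for url, info in index.items()
            ],
        )
        connection.commit()
    finally:
        connection.close()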
def main():
    # Start the crawling process
    sloth_bot()


if __name__ == "__main__":
    main()
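# For reference only: a minimal sketch of the PageRank iteration that
# compute_pagerank (imported from serving/pagerank.py) is assumed to perform.
# The real implementation may differ; this version uses plain power iteration
# with a damping factor and skips refinements such as dangling-node handling.
# It is never called by the crawler above.
def pagerank_sketch(graph, damping=0.85, iterations=20):
    """graph maps each URL to the set of URLs it links to."""
    # Collect every node that appears anywhere in the graph.
    nodes = set(graph)
    for links in graph.values():
        nodes |= links
    if not nodes:
        return {}

    n = len(nodes)
    ranks = {node: 1.0 / n for node in nodes}
    for _ in range(iterations):
        # Every page keeps a baseline (1 - damping) / n of rank ...
        new_ranks = {node: (1.0 - damping) / n for node in nodes}
        # ... and each page shares its damped rank equally among its out-links.
        for node, links in graph.items():
            if not links:
                continue
            share = damping * ranks[node] / len(links)
            for link in links:
                new_ranks[link] += share
        ranks = new_ranks
    return ranks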