commit 7771dffa6ec46cf83ce1efd7686dedbdf8a79406 Author: The Coding Sloth <143575542+The-CodingSloth@users.noreply.github.com> Date: Thu Nov 7 17:30:24 2024 -0500 first commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5ba8d32 --- /dev/null +++ b/.gitignore @@ -0,0 +1,164 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. 
For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +.DS_Store \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e7e63bb --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 The Coding Sloth + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..496d943 --- /dev/null +++ b/README.md @@ -0,0 +1,93 @@ +# Sloth Search - A Google-like Search Engine Clone + +Sloth Search is a project that aims to recreate Google, including crawling, indexing, and serving results through a user-friendly front-end interface. The project consists of three main components: the Client, Search, and Server. +[Check out the video for a full explanation here](https://youtu.be/WCpimlH0Kck?si=_zFzrb1cxZinWKo3) + +## Project Structure + +The project is divided into the following folders: + +- **Client**: Contains the front-end code, providing a user interface similar to Google search, where users can enter queries and view search results. +- **Search**: Contains the core components of Sloth Search, which replicate the three main parts of Google: + - **Crawling**: The web crawler that collects information from the web. + - **Indexing**: Processing and storing the content collected by the crawler for efficient searching. + - **Serving (PageRank)**: Serving search results ranked by relevance and PageRank scores. +- **Server**: Contains the search API used to handle client requests and provide search results. + +## Installation and Setup + +1. **Clone the Repository** + + ```sh + git clone + cd sloth-search + ``` + +2. **Install Dependencies** + + To install the necessary Python dependencies, run: + +```sh +pip install -r requirements.txt +``` + +3. **Client Setup** + + - The client contains the HTML, CSS, and JavaScript code to run the front-end. + - Open the `index.html` file in your browser, or use a static file server to serve the client code locally. + - You can also use the Live Server extension. + +4. **Search Setup** + +- The `Search` directory contains the code for crawling, indexing, and serving. +- You can start the process by running: + ```sh + python search/complete_examples/advanced_pagerank.py + ``` +- This will crawl, index, and prepare the content for searching. +- To run any of the other files, follow the same process: + +```sh +python search/ +``` + +5. **Server Setup** + - The server uses Flask to provide an API for search queries.
+ - Start the Flask server by navigating to the `Server` directory and running: + ```sh + python google_search_api.py + ``` + +## How It Works + +1. **Crawling** + + - The crawler starts with a set of seed URLs and collects links and content from the web. + - It respects `robots.txt` to avoid being blocked and to ensure ethical crawling. + - Parsed data is stored in a format ready for indexing. + +2. **Indexing** + + - The indexing module processes the crawled pages. + - The content is tokenized, cleaned, stemmed, and stop words are removed using the NLTK library. + - The resulting indexed data is saved to be used by the search API. + +3. **Serving and PageRank** + - The PageRank algorithm is used to rank pages based on their importance. + - When a user searches for a query through the client, the server uses the indexed data and PageRank scores to return the most relevant pages. + +## Important Notes + +- **Respecting Websites**: The crawler respects `robots.txt` rules. Please make sure not to overload any websites. +- **PageRank Algorithm**: The implementation of the PageRank algorithm uses an iterative approach to rank pages based on the links. +- **Data Storage**: The crawler and indexer use CSV files for data storage (`advanced_pagerank_inverted_index.csv` and `advanced_pagerank.csv`). Make sure these files are writable during execution. + +## Contributing + +Contributions are welcome! If you'd like to contribute to the development of Sloth Search, feel free to fork the repository, make changes, and submit a pull request. + +## License + +This project is open-source and available under the MIT License. + +If you have any questions or suggestions, feel free to contact me. + +Happy Searching with Sloth Search! 🦥🔍 diff --git a/client/images/google_camera.svg b/client/images/google_camera.svg new file mode 100644 index 0000000..c064e9e --- /dev/null +++ b/client/images/google_camera.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/client/images/google_mic.svg b/client/images/google_mic.svg new file mode 100644 index 0000000..1ca386e --- /dev/null +++ b/client/images/google_mic.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/client/images/google_search_icon.svg b/client/images/google_search_icon.svg new file mode 100644 index 0000000..6e03b6c --- /dev/null +++ b/client/images/google_search_icon.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/client/images/sloth_search.png b/client/images/sloth_search.png new file mode 100644 index 0000000..68260d0 Binary files /dev/null and b/client/images/sloth_search.png differ diff --git a/client/index.html b/client/index.html new file mode 100644 index 0000000..8d9561c --- /dev/null +++ b/client/index.html @@ -0,0 +1,77 @@ + + + + + + am real programmer + + + + +
+ <!-- The index.html markup was lost in this dump; the recoverable content is a Google-style homepage: a top bar with About, Store, Gmail, and Images links plus a profile picture, the Sloth Search logo, the search form with microphone and camera icons and its buttons, and a footer link bar (see styles.css below). -->
+ + + diff --git a/client/search.html b/client/search.html new file mode 100644 index 0000000..16b39e5 --- /dev/null +++ b/client/search.html @@ -0,0 +1,119 @@ + + + + + + Search Results - My Search Engine + + + +
+ <!-- The search.html markup was also lost; based on styles.css it renders a compact header with the search form, a #results list of title/URL/description entries, and #pagination controls, populated from the /search API. -->
+ + + + diff --git a/client/styles.css b/client/styles.css new file mode 100644 index 0000000..89c3926 --- /dev/null +++ b/client/styles.css @@ -0,0 +1,247 @@ +* { + margin: 0; + padding: 0; + + font-family: 'Roboto', sans-serif; +} + +body { + display: flex; + flex-direction: column; + min-height: 100vh; + /* ensures the body takes up at least the full viewport height */ +} + +a { + all: unset; + text-decoration: none; + /* no underline */ +} + +.top-section { + padding: 1rem; + display: flex; + justify-content: space-between; +} + +.app-icon { + width: 1.5rem; + height: 1.5rem; +} + +.profile-pic { + width: 2rem; + height: 2rem; + border-radius: 100%; +} + +.left-side { + display: flex; + gap: 1.5rem; +} + +.right-side { + display: flex; + gap: 1.5rem; + justify-content: center; + align-items: center; +} + +.left-side a, +.right-side a { + color: #202124; + font-size: 0.8rem; +} + +.middle-section { + flex-grow: 1; + display: flex; + flex-direction: column; + justify-content: center; + align-items: center; + padding: 1rem 0; + gap: 1.2rem; +} + +.search-label { + display: none; +} + +.search-form { + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + gap: 2.5rem; +} +.result-search-form { + flex-direction: column; + align-items: center; + justify-content: center; + gap: 2.5rem; +} + +.search-form-input { + display: flex; + align-items: center; + justify-content: center; + gap: 1rem; + border: 1px solid #dfe1e5; + border-radius: 30px; + padding: 0.3rem 1.5rem; + box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.1); +} + +.search-form input { + width: 27rem; + padding: 0.5rem; + border: none; + outline: none; +} + +.buttons { + display: flex; + gap: 1rem; +} + +.search-form button { + border: 1px solid #f8f9fa; + padding: 0.5rem 1rem; + background-color: #f8f9fa; + font-size: 0.9rem; +} +.search-icon-home { + width: 1rem; + height: 1rem; +} +.search-icon-result { + width: 1.5rem; + height: 1.5rem; +} +.mic, +.camera { + width: 1.5rem; + height: 1.5rem; +} + +.bottom-section { + margin-top: 15rem; + padding: 1rem; + display: flex; + justify-content: space-between; + align-items: center; + background-color: #f2f2f2; + font-size: 0.9em; + padding-left: 2rem; + padding-right: 2rem; +} + +.bottom-left, +.bottom-right { + display: flex; + gap: 1.8rem; +} + +.bottom-middle { + padding-right: 10rem; +} + +.bottom-section a { + color: #70757a; +} + +.search-form button { + background-color: #f8f9fa; + border: 1px solid #f8f9fa; + border-radius: 4px; + color: #3c4043; + font-family: Roboto, arial, sans-serif; + font-size: 14px; + margin: 11px 4px; + padding: 0 16px; + line-height: 27px; + height: 36px; + min-width: 54px; + text-align: center; + cursor: pointer; + user-select: none; +} + +.bottom-section { + display: flex; + justify-content: space-between; + align-items: center; + background-color: #f2f2f2; + padding: 1rem 1.5rem; + margin-top: 15rem; +} + +.bottom-section a { + margin: 0 1rem; +} + +.bottom-middle { + margin-right: 8rem; +} + +.search-result-area { + display: flex; + padding-left: 1rem; + gap: 1rem; +} +.search-logo-home { + width: 20rem; +} +.search-logo-result { + width: 7rem; +} + +#results { + padding-top: 1rem; + display: flex; + flex-direction: column; + gap: 1rem; + padding-left: 2rem; + padding-right: 2rem; +} +.result:hover { + cursor: pointer; +} + +.result-description { + font-size: 0.8rem; + width: 50%; + color: #545454; +} +.result { + margin-bottom: 20px; +} +.result-title { + font-size: 18px; + color: #1a0dab; + text-decoration: none; 
+} +.result-title:hover { + text-decoration: underline; +} +.result-url { + font-size: 14px; + color: #006621; +} +#pagination { + display: flex; + justify-content: center; + align-items: center; + gap: 1.5rem; + padding: 2rem; + font-size: 1.2rem; +} + +#pagination a { + color: #1a0dab; +} + +#pagination a:hover { + text-decoration: underline; + cursor: pointer; +} diff --git a/search/complete_examples/advanced_pagerank.py b/search/complete_examples/advanced_pagerank.py new file mode 100644 index 0000000..f7128a5 --- /dev/null +++ b/search/complete_examples/advanced_pagerank.py @@ -0,0 +1,239 @@ + +from bs4 import BeautifulSoup +import requests +import time +import random +from queue import Queue +from concurrent.futures import ThreadPoolExecutor +import threading +from urllib.parse import urlparse +import csv +import sys +import os +# Add the root directory to sys.path +# This is to be able to import modules from other directories (indexing and serving) idk why... +# any imports from indexing/serving need to happen under this +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from indexing.advanced_indexing import advanced_index_page +from serving.pagerank import compute_pagerank + + +# Function to check robots.txt for permission to crawl +# If we don't do this, we could get blocked/banned +# since we don't have permission to crawl. +def can_crawl(url): + parsed_url = urlparse(url) + robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt" + print(f"Checking robots.txt for: {robots_url}") + time.sleep(random.uniform(1, 3)) + try: + response = requests.get(robots_url, timeout=5) + response.raise_for_status() + disallowed_paths = [] + for line in response.text.splitlines(): + if line.startswith("Disallow"): + parts = line.split() + if len(parts) > 1: + disallowed_paths.append(parts[1]) + for path in disallowed_paths: + if urlparse(url).path.startswith(path): + print(f"Disallowed by robots.txt: {url}") + return False + return True + except requests.RequestException: + print(f"Failed to access robots.txt: {robots_url}") + return False # If we can't access robots.txt, assume we can't crawl (we're being nice here) + +# Function to fetch and parse URL +def crawl(args): + queue = args['queue'] + visited_urls = args['visited_urls'] + crawl_count = args['crawl_count'] + CRAWL_LIMIT = args['CRAWL_LIMIT'] + lock = args['lock'] + index = args['index'] + webpage_info = args['webpage_info'] + webpage_id_counter = args['webpage_id_counter'] + pagerank_graph = args['pagerank_graph'] + stop_crawl = args['stop_crawl'] + + while not stop_crawl.is_set(): + try: + current_url = queue.get(timeout=5) + print("Time to crawl: " + current_url) + except Exception: + break # Exit if no more URLs are available to crawl + + with lock: + if crawl_count[0] >= CRAWL_LIMIT: + queue.queue.clear() # Clear remaining URLs to stop processing + print("Crawl limit reached. 
Exiting...") + stop_crawl.set() + break + if current_url in visited_urls: + queue.task_done() + continue + visited_urls.add(current_url) + + """ Checks for noindex directive in the page + Comment this out if you don't care about noindex + WARNING: websites could block/ban you if you don't have permission + """ + if not can_crawl(current_url): + queue.task_done() + continue + + time.sleep(random.uniform(2, 5)) + try: + response = requests.get(current_url, timeout=5) + response.raise_for_status() # Check for request errors + content = response.content + + """ Checks for noindex directive in the page + Comment this out if you don't care about noindex + WARNING: websites could block/ban you if you don't have permission + """ + if 'noindex' in content.decode('utf-8').lower(): + print(f"Noindex found, skipping: {current_url}") + queue.task_done() + continue + + + # Parse the fetched content to find new URLs + webpage = BeautifulSoup(content, "html.parser") + + # Index the webpage + indexed_page = advanced_index_page(webpage, current_url) + with lock: + for word in indexed_page["words"]: + if word not in index: + index[word] = set() + index[word].add(webpage_id_counter[0]) + webpage_info[webpage_id_counter[0]] = indexed_page + webpage_id_counter[0] += 1 + + hyperlinks = webpage.select("a[href]") + #NEW: Add hyperlink connections for pagerank + new_urls, hyperlink_connections = parse_links(hyperlinks, current_url) + pagerank_graph[current_url] = hyperlink_connections + + with lock: + for new_url in new_urls: + if new_url not in visited_urls: + queue.put(new_url) + crawl_count[0] += 1 + + except requests.RequestException as e: + print(f"Failed to fetch {current_url}: {e}") + finally: + queue.task_done() + +# Function to parse links from HTML content +def parse_links(hyperlinks, current_url): + urls = [] + #NEW: Add hyperlink connections for pagerank + hyperlink_connections = set() + for hyperlink in hyperlinks: + url = hyperlink["href"] + + # Format the URL into a proper URL + if url.startswith("#"): + continue # Skip same-page anchors + if url.startswith("//"): + url = "https:" + url # Add scheme to protocol-relative URLs + elif url.startswith("/"): + # Construct full URL for relative links + base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url)) + url = base_url + url + elif not url.startswith("http"): + continue # Skip non-HTTP links + url = url.split("#")[0] # Remove anchor + + hyperlink_connections.add(url) + urls.append(url) + return urls, hyperlink_connections + +# Main crawling function +def sloth_bot(): + # Start with the initial pages to crawl + starting_urls = [ + "https://www.wikipedia.org/wiki/Google", + "https://www.bbc.com/news/world", + "https://news.ycombinator.com/", + ] + + urls_to_crawl = Queue() + for seed_url in starting_urls: + urls_to_crawl.put(seed_url) + + visited_urls = set() # URL tracking + CRAWL_LIMIT = 20 # Set crawl limit + crawl_count = [0] # Shared counter + lock = threading.Lock() # Thread safety lock + index = {} + webpage_info = {} + #NEW: pagerank graph for pagerank. + # This will be used to store the connections between hyperlinks + pagerank_graph = {} + webpage_id_counter = [0] + stop_crawl = threading.Event() + + # Start concurrent crawling with ThreadPoolExecutor + #Concurrency = speed + #Threads go BRRRRR + #Increase this if you want more threads, but be careful with these. 
+ NUM_WORKERS = 100 + #Setting up arguments for the crawl function + args = { + 'queue': urls_to_crawl, + 'visited_urls': visited_urls, + 'crawl_count': crawl_count, + 'CRAWL_LIMIT': CRAWL_LIMIT, + 'lock': lock, + 'index': index, + 'webpage_info': webpage_info, + 'webpage_id_counter': webpage_id_counter, + 'pagerank_graph': pagerank_graph, + 'stop_crawl': stop_crawl + } + + with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor: + for _ in range(NUM_WORKERS): + executor.submit(crawl, args) + + print("All URLs have been crawled") + + #NEW: Computes pagerank + pagerank_scores = compute_pagerank(pagerank_graph) + + + """ This part is for saving the data to CSV files. + However, if you don't want to save the data, you can remove/comment out this part. + If you want to use a database, you can replace this part with a database connection. + """ + with open('advanced_pagerank_inverted_index.csv', 'w', newline='', encoding='utf-8') as csvfile: + fieldnames = ['word', 'doc_ids'] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for word, doc_ids in index.items(): + writer.writerow({'word': word, 'doc_ids': list(doc_ids)}) + + with open('advanced_pagerank.csv', 'w', newline='', encoding='utf-8') as csvfile: + fieldnames = ['doc_id', 'url', 'title', 'description', 'pagerank'] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for doc_id, info in webpage_info.items(): + writer.writerow({ + 'doc_id': doc_id, + 'url': info['url'], + 'title': info['title'], + 'description': info['description'], + 'pagerank': pagerank_scores.get(info['url'], 0) + }) + +# Entry point for the script +def main(): + sloth_bot() + +if __name__ == "__main__": + main() diff --git a/search/complete_examples/simple_pagerank.py b/search/complete_examples/simple_pagerank.py new file mode 100644 index 0000000..af42cdb --- /dev/null +++ b/search/complete_examples/simple_pagerank.py @@ -0,0 +1,110 @@ +from bs4 import BeautifulSoup +import requests +import time +import random +import csv +import sys +import os +# Add the root directory to sys.path +# This is to be able to import modules from other directories (indexing and serving) idk why... 
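+ # (The reason this works: Python resolves imports by searching sys.path, so appending the parent `search` directory makes the `indexing/` and `serving/` folders importable as packages.)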
+# any imports from indexing/serving need to happen under this +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from indexing.simple_indexing import simple_index_page +from serving.pagerank import compute_pagerank + +def sloth_bot(): + # Our list of URLs to crawl + urls = ["https://en.wikipedia.org/wiki/Google"] + visited_urls = set() + + # Create the index and graph + index = {} # URL -> page contents + pagerank_graph = {} # URL -> set of URLs it links to + CRAWL_LIMIT = 5 + crawl_count = 0 + + # Loops through the list of URLs + while urls and crawl_count < CRAWL_LIMIT: + # Grab the next URL + current_url = urls.pop() + if current_url in visited_urls: + continue + print("Time to crawl: " + current_url) + time.sleep(random.uniform(1, 2)) + try: + response = requests.get(current_url) + response.raise_for_status() + except requests.RequestException as e: + print(f"Failed to retrieve {current_url}: {e}") + continue + + # Parse the content of the page + webpage = BeautifulSoup(response.content, "html.parser") + + # Add the page to the index + indexed_page = simple_index_page(webpage, current_url) + index[current_url] = indexed_page + visited_urls.add(current_url) + + # Grab the links from the page + hyperlinks = webpage.select("a[href]") + #This is where we store our connected pages + hyperlink_connections = set() + for hyperlink in hyperlinks: + url = hyperlink["href"] + # Format the URL into a proper URL + if url.startswith("#"): + continue + if url.startswith("//"): + url = "https:" + url + elif url.startswith("/"): + base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url)) + url = base_url + url + elif not url.startswith("http"): + continue + url = url.split('#')[0] + #Add to the link connection + hyperlink_connections.add(url) + # If we haven't visited this URL yet, add it to our list + if url not in visited_urls: + urls.append(url) + + # Update the page's outgoing links + index[current_url]['hyperlink_connections'] = hyperlink_connections + pagerank_graph[current_url] = hyperlink_connections + + crawl_count += 1 + print(f"Crawled count: {crawl_count}, index size: {len(index)}, URLs left: {len(urls)}") + + # Compute PageRank + pagerank_scores = compute_pagerank(pagerank_graph) + + """ This part is for saving the data to CSV files. + However, if you don't want to save the data, you can remove/comment out this part. + If you want to use a database, you can replace this part with a database connection. 
+ """ + + with open('simple_pagerank.csv', 'w', newline='', encoding='utf-8') as csvfile: + fieldnames = ["url", "title", "description", "pagerank", "words"] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for url, info in index.items(): + writer.writerow({ + 'url': url, + 'title': info['title'], + 'description': info['description'], + 'pagerank': pagerank_scores.get(url, 0), + 'words': ', '.join(info['words']) + }) + + + +def main(): + # Start the crawling process + sloth_bot() + +if __name__ == "__main__": + main() + + + diff --git a/search/crawling/advanced_crawler.py b/search/crawling/advanced_crawler.py new file mode 100644 index 0000000..ff217a0 --- /dev/null +++ b/search/crawling/advanced_crawler.py @@ -0,0 +1,224 @@ +from bs4 import BeautifulSoup +import requests +import time +import random +from queue import Queue +from concurrent.futures import ThreadPoolExecutor +import threading +from urllib.parse import urlparse +import csv +from indexing.advanced_indexing import index_page +import sys +import os +# Add the root directory to sys.path +# This is to be able to import modules from other directories (indexing and serving) idk why... +# any imports from indexing/serving need to happen under this +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +# Function to check robots.txt for permission to crawl +# If we don't do this, we could get blocked/banned +# since we don't have permission to crawl. +def can_crawl(url): + parsed_url = urlparse(url) + robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt" + print(f"Checking robots.txt for: {robots_url}") + time.sleep(random.uniform(1, 3)) + try: + response = requests.get(robots_url, timeout=5) + response.raise_for_status() + disallowed_paths = [] + for line in response.text.splitlines(): + if line.startswith("Disallow"): + parts = line.split() + if len(parts) > 1: + disallowed_paths.append(parts[1]) + for path in disallowed_paths: + if urlparse(url).path.startswith(path): + print(f"Disallowed by robots.txt: {url}") + return False + return True + except requests.RequestException: + print(f"Failed to access robots.txt: {robots_url}") + return False # If we can't access robots.txt, assume we can't crawl (we're being nice here) + +# Function to fetch and parse URL +def crawl(args): + queue = args['queue'] + visited_urls = args['visited_urls'] + crawl_count = args['crawl_count'] + CRAWL_LIMIT = args['CRAWL_LIMIT'] + lock = args['lock'] + index = args['index'] + webpage_info = args['webpage_info'] + webpage_id_counter = args['webpage_id_counter'] + stop_crawl = args['stop_crawl'] + + while not stop_crawl.is_set(): + try: + current_url = queue.get(timeout=5) + print("Time to crawl: " + current_url) + except Exception: + break # Exit if no more URLs are available to crawl + + with lock: + if crawl_count[0] >= CRAWL_LIMIT: + queue.queue.clear() # Clear remaining URLs to stop processing + print("Crawl limit reached. 
Exiting...") + stop_crawl.set() + break + if current_url in visited_urls: + queue.task_done() + continue + visited_urls.add(current_url) + + """ Checks for noindex directive in the page + Comment this out if you don't care about noindex + WARNING: websites could block/ban you if you don't have permission + """ + # if not can_crawl(current_url): + # queue.task_done() + # continue + + time.sleep(random.uniform(2, 5)) + try: + response = requests.get(current_url, timeout=5) + response.raise_for_status() # Check for request errors + content = response.content + + """ Checks for noindex directive in the page + Comment this out if you don't care about noindex + WARNING: websites could block/ban you if you don't have permission + """ + # if 'noindex' in content.decode('utf-8').lower(): + # print(f"Noindex found, skipping: {current_url}") + # queue.task_done() + # continue + + + # Parse the fetched content to find new URLs + webpage = BeautifulSoup(content, "html.parser") + + # Index the webpage + indexed_page = index_page(webpage, current_url) + with lock: + for word in indexed_page["words"]: + if word not in index: + index[word] = set() + index[word].add(webpage_id_counter[0]) + webpage_info[webpage_id_counter[0]] = indexed_page + webpage_id_counter[0] += 1 + + hyperlinks = webpage.select("a[href]") + new_urls = parse_links(hyperlinks, current_url) + + with lock: + for new_url in new_urls: + if new_url not in visited_urls: + queue.put(new_url) + crawl_count[0] += 1 + + except requests.RequestException as e: + print(f"Failed to fetch {current_url}: {e}") + finally: + queue.task_done() + +# Function to parse links from HTML content +def parse_links(hyperlinks, current_url): + urls = [] + for hyperlink in hyperlinks: + url = hyperlink["href"] + + # Format the URL into a proper URL + if url.startswith("#"): + continue # Skip same-page anchors + if url.startswith("//"): + url = "https:" + url # Add scheme to protocol-relative URLs + elif url.startswith("/"): + # Construct full URL for relative links + base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url)) + url = base_url + url + elif not url.startswith("http"): + continue # Skip non-HTTP links + url = url.split("#")[0] # Remove anchor + urls.append(url) + return urls + +# Main crawling function +def sloth_bot(): + # Start with the initial pages to crawl + starting_urls = [ + "https://www.wikipedia.org/wiki/Google", + "https://www.bbc.com/news/world", + "https://news.ycombinator.com/", + ] + + urls_to_crawl = Queue() + for seed_url in starting_urls: + urls_to_crawl.put(seed_url) + + visited_urls = set() # URL tracking + CRAWL_LIMIT = 20 # Set crawl limit + crawl_count = [0] # Shared counter + lock = threading.Lock() # Thread safety lock + index = {} + webpage_info = {} + webpage_id_counter = [0] + stop_crawl = threading.Event() + + # Start concurrent crawling with ThreadPoolExecutor + #Concurrency = speed + #Threads go BRRRRR + #Increase this if you want more threads, but be careful with these. 
+ NUM_WORKERS = 100 + #Setting up arguments for the crawl function + args = { + 'queue': urls_to_crawl, + 'visited_urls': visited_urls, + 'crawl_count': crawl_count, + 'CRAWL_LIMIT': CRAWL_LIMIT, + 'lock': lock, + 'index': index, + 'webpage_info': webpage_info, + 'webpage_id_counter': webpage_id_counter, + 'stop_crawl': stop_crawl + } + + with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor: + for _ in range(NUM_WORKERS): + executor.submit(crawl, args) + + print("All URLs have been crawled") + + + """ This part is for saving the data to CSV files. + However, if you don't want to save the data, you can remove/comment out this part. + If you want to use a database, you can replace this part with a database connection. + """ + with open('advanced_inverted_index.csv', 'w', newline='', encoding='utf-8') as csvfile: + fieldnames = ['word', 'doc_ids'] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for word, doc_ids in index.items(): + writer.writerow({'word': word, 'doc_ids': list(doc_ids)}) + + with open('advanced_doc_info.csv', 'w', newline='', encoding='utf-8') as csvfile: + fieldnames = ['doc_id', 'url', 'title', 'description'] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for doc_id, info in webpage_info.items(): + writer.writerow({ + 'doc_id': doc_id, + 'url': info['url'], + 'title': info['title'], + 'description': info['description'] + }) + +def main(): + # Start the crawling process + sloth_bot() + +if __name__ == "__main__": + main() + + + diff --git a/search/crawling/simple_crawler.py b/search/crawling/simple_crawler.py new file mode 100644 index 0000000..7c80ce7 --- /dev/null +++ b/search/crawling/simple_crawler.py @@ -0,0 +1,65 @@ +from bs4 import BeautifulSoup +import requests +import time +import random + +def sloth_bot(): + # our list of URLs to crawl + urls = ["https://en.wikipedia.org/wiki/Google"] + visited_urls = set() + #timer to see how long it takes to crawl + start = time.time() + #Loops through the list of urls + CRAWL_LIMIT = 15 + current_crawl_count = 0 + + while urls and current_crawl_count < CRAWL_LIMIT: + # grabs the next url + current_url = urls.pop(0) + print("time to crawl: " + current_url) + time.sleep(random.uniform(1, 3)) + try: + response = requests.get(current_url) + response.raise_for_status() + except requests.RequestException as e: + print(f"Failed to retrieve {current_url}: {e}") + continue + + # grabbing the content of the page + webpage = BeautifulSoup(response.content, "html.parser") + + # grabbing the links from the page + hyperlinks = webpage.select("a[href]") + # looping through the links and adding them to our list of urls + for hyperlink in hyperlinks: + url = hyperlink["href"] + #Formats the url into a proper url (don't worry about this) + if url.startswith("#"): + continue + if url.startswith("//"): + url = "https:" + url + elif url.startswith("/"): + base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url)) + url = base_url + url + elif not url.startswith("http"): + continue + # + url = url.split('#')[0] + + #if we haven't visited this url yet, add it to our list + if url not in visited_urls: + urls.append(url) + visited_urls.add(url) + + current_crawl_count += 1 + + +def main(): + # Start the crawling process + sloth_bot() + +if __name__ == "__main__": + main() + + + diff --git a/search/indexing/__init__.py b/search/indexing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/search/indexing/advanced_indexing.py 
b/search/indexing/advanced_indexing.py new file mode 100644 index 0000000..5e4e98f --- /dev/null +++ b/search/indexing/advanced_indexing.py @@ -0,0 +1,73 @@ +import nltk +import ssl +from nltk.corpus import stopwords +from nltk.stem import PorterStemmer +from nltk.tokenize import word_tokenize + +try: + _create_unverified_https_context = ssl._create_unverified_context +except AttributeError: + pass +else: + ssl._create_default_https_context = _create_unverified_https_context +nltk.download('stopwords') +nltk.download('punkt_tab') +try: + _create_unverified_https_context = ssl._create_unverified_context +except AttributeError: + pass +else: + ssl._create_default_https_context = _create_unverified_https_context + +# Download NLTK data only if not already downloaded +def download_nltk_resources(): + try: + stopwords.words('english') + except LookupError: + nltk.download('stopwords') + try: + word_tokenize('test') + except LookupError: + nltk.download('punkt') +#Function that indexes the webpage +def advanced_index_page(webpage, webpage_url): + #Download NLTK data only if not already downloaded + download_nltk_resources() + + # Initialize NLTK components + stop_words = set(stopwords.words('english')) + ps = PorterStemmer() + #Collect title and description + title_tag = webpage.find('title') + title = title_tag.get_text().strip() if title_tag else 'No Title' + + #Collect description + description = '' + meta_description = webpage.find('meta', attrs={'name': 'description'}) + if meta_description and 'content' in meta_description.attrs: + description = meta_description['content'] + else: + text_content = webpage.get_text(separator=" ", strip=True) + description = text_content[:200] + "..." if len(text_content) > 200 else text_content + + + # Grab ALL the words in the page. + text_content = webpage.get_text(separator=' ', strip=True) + #Splitting them into the individual words + tokens = word_tokenize(text_content.lower()) + #Big brain techniques 2 and 3 + #Stemming the words and removing stop words. + filtered_words = [ + ps.stem(word) for word in tokens if word.isalpha() and word not in stop_words + ] + + #Add the information to the index + indexed_page = { + "url": webpage_url, + "title": title, + "description": description, + "words": filtered_words + } + #If you want to print the results + #print(f"Indexed: {webpage_url}. \n Here's the info: \n title: {title} \n description: {description} \n number of words: {len(filtered_words)} \n") + return indexed_page diff --git a/search/indexing/simple_indexing.py b/search/indexing/simple_indexing.py new file mode 100644 index 0000000..13845fe --- /dev/null +++ b/search/indexing/simple_indexing.py @@ -0,0 +1,34 @@ +import re + +def simple_index_page(webpage, webpage_url): + + #Collect title and description + title_tag = webpage.find('title') + title = title_tag.get_text().strip() if title_tag else 'No Title' + + #Collect description + description = '' + meta_description = webpage.find('meta', attrs={'name': 'description'}) + if meta_description and 'content' in meta_description.attrs: + description = meta_description['content'] + else: + text_content = webpage.get_text(separator=" ", strip=True) + description = text_content[:200] + "..." if len(text_content) > 200 else text_content + + #Grab ALL the words in the page + #regex disgusting... + words = re.findall(r'\b\w+\b', webpage.get_text(separator=" ", strip=True).lower()) + + #Double check and filter out any numbers, symbols, etc. 
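+ # (str.isalpha() drops any token containing digits or punctuation, e.g. '2024' or 'html5', so only purely alphabetic words survive.)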
+ #WE ONLY WANT WORDS + words = [word for word in words if word.isalpha()] + + #Add the information to the index + indexed_page = { + "url": webpage_url, + "title": title, + "description": description, + "words": words + } + print(f"Indexed: {webpage_url}. \n Here's the info: \n title: {title} \n description: {description} \n number of words: {len(words)} \n") + return indexed_page \ No newline at end of file diff --git a/search/serving/pagerank.py b/search/serving/pagerank.py new file mode 100644 index 0000000..16c1deb --- /dev/null +++ b/search/serving/pagerank.py @@ -0,0 +1,34 @@ + + +def compute_pagerank(graph, damping_factor=0.85, max_iterations=100, tol=1.0e-6): + # Build the set of all URLs + all_nodes = set(graph.keys()) + for links in graph.values(): + all_nodes.update(links) + num_nodes = len(all_nodes) + # Initialize PageRank scores + pagerank = {url: 1.0 / num_nodes for url in all_nodes} + # Identify dangling nodes (nodes with no outgoing links) + dangling_nodes = [url for url in all_nodes if url not in graph or len(graph[url]) == 0] + # Iterative computation + for iteration in range(max_iterations): + new_pagerank = {} + # Sum of PageRank scores from dangling nodes + dangling_sum = damping_factor * sum(pagerank[node] for node in dangling_nodes) / num_nodes + for url in all_nodes: + rank = (1.0 - damping_factor) / num_nodes + rank += dangling_sum + # Sum contributions from incoming links + for node in graph: + if url in graph[node]: + out_degree = len(graph[node]) + rank += damping_factor * pagerank[node] / out_degree + new_pagerank[url] = rank + # Check for convergence + error = sum(abs(new_pagerank[url] - pagerank[url]) for url in all_nodes) + if error < tol: + break + pagerank = new_pagerank + for url in all_nodes: + pagerank[url] = round(pagerank[url], 6) + return pagerank diff --git a/server/google_search_api.py b/server/google_search_api.py new file mode 100644 index 0000000..3542e09 --- /dev/null +++ b/server/google_search_api.py @@ -0,0 +1,136 @@ +from flask import Flask, request, jsonify +import csv +import nltk +from nltk.corpus import stopwords +from nltk.stem import PorterStemmer +from nltk.tokenize import word_tokenize +import ssl +from flask_cors import CORS +app = Flask(__name__) + + +CORS(app) + +# NLTK setup (handles SSL certificate issues) +try: + _create_unverified_https_context = ssl._create_unverified_context +except AttributeError: + pass +else: + ssl._create_default_https_context = _create_unverified_https_context + +# Download NLTK data only if not already downloaded +def download_nltk_resources(): + try: + stopwords.words('english') + except LookupError: + nltk.download('stopwords') + try: + word_tokenize('test') + except LookupError: + nltk.download('punkt') + +# Initialize NLTK components +download_nltk_resources() +stop_words = set(stopwords.words('english')) +ps = PorterStemmer() + + +def load_inverted_index(file_path): + inverted_index = {} + with open(file_path, 'r', encoding='utf-8') as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + word = row['word'] + doc_ids_str = row['doc_ids'].strip("[]") # Remove brackets + doc_ids_list = doc_ids_str.split(', ') if doc_ids_str else [] + doc_ids = set(int(doc_id) for doc_id in doc_ids_list) + inverted_index[word] = doc_ids + return inverted_index + +def load_document_info(file_path): + document_info = {} + with open(file_path, 'r', encoding='utf-8') as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + doc_id = int(row['doc_id']) + document_info[doc_id] = { + 'url': 
row['url'], + 'title': row['title'], + 'description': row['description'], + 'pagerank': float(row['pagerank']) + } + return document_info + +def parse_query(query): + # Tokenize the query + tokens = word_tokenize(query.lower()) + # Remove non-alphabetic tokens and stop words, then stem the words + query_words = [ + ps.stem(word) for word in tokens if word.isalpha() and word not in stop_words + ] + return query_words + +def search(query, inverted_index, document_info, num_results=10, page=1): + query_words = parse_query(query) + if not query_words: + return [] + # Find documents that contain any of the query words + matched_doc_ids = set() + for word in query_words: + if word in inverted_index: + matched_doc_ids.update(inverted_index[word]) + if not matched_doc_ids: + return [] + # Retrieve documents and their PageRank scores + results = [] + for doc_id in matched_doc_ids: + info = document_info[doc_id] + results.append({ + 'doc_id': doc_id, + 'url': info['url'], + 'title': info['title'], + 'description': info['description'], + 'pagerank': info['pagerank'] + }) + # Sort documents by PageRank score + sorted_results = sorted(results, key=lambda x: x['pagerank'], reverse=True) + # Pagination + start = (page - 1) * num_results + end = start + num_results + paginated_results = sorted_results[start:end] + return paginated_results + +# Load the inverted index and document info +# If you are using a different file, replace the path with the path to your file +#If you're using a database, replace this with the code to connect to your database +try: + inverted_index = load_inverted_index('../search/complete_examples/advanced_pagerank_inverted_index.csv') + document_info = load_document_info('../search/complete_examples/advanced_pagerank.csv') +except FileNotFoundError: + try: + inverted_index = load_inverted_index("../advanced_pagerank_inverted_index.csv") + document_info = load_document_info("../advanced_pagerank.csv") + except FileNotFoundError: + print("Error: Files not found, run the advanced_pagerank.py file first") + print("Exiting...") + exit() + + +@app.route('/search') +def search_api(): + query = request.args.get('q', '') + num_results = int(request.args.get('num_results', 10)) + page = int(request.args.get('page', 1)) + if not query: + return jsonify({'error': 'No query provided'}), 400 + results = search(query, inverted_index, document_info, num_results=num_results, page=page) + return jsonify({ + 'query': query, + 'page': page, + 'num_results': num_results, + 'results': results + }) + +if __name__ == '__main__': + app.run(debug=True) \ No newline at end of file
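For reference, here is a minimal sketch of how a client or test script might call the search API above. It assumes the Flask development server is running at its default address (http://127.0.0.1:5000) and that the CSV files produced by advanced_pagerank.py already exist; the /search endpoint and its q, num_results, and page parameters are taken directly from google_search_api.py, while the script name and helper function are illustrative.

```python
# query_sloth_search.py (hypothetical helper) - exercises the /search endpoint
import requests

def sloth_search(query, num_results=10, page=1):
    # The API responds with JSON: {query, page, num_results, results}, where each
    # result carries doc_id, url, title, description, and pagerank (sorted by pagerank).
    response = requests.get(
        "http://127.0.0.1:5000/search",  # assumes Flask dev-server defaults
        params={"q": query, "num_results": num_results, "page": page},
        timeout=10,
    )
    response.raise_for_status()
    return response.json()

if __name__ == "__main__":
    data = sloth_search("google")
    for result in data["results"]:
        print(f"{result['title']} ({result['pagerank']}) -> {result['url']}")
```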