mirror of https://github.com/The-CodingSloth/sloth-search.git
synced 2025-12-19 09:54:08 +00:00
73 lines · 2.5 KiB · Python
import nltk
import ssl
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Allow nltk.download() to work on systems where HTTPS certificate
# verification fails (a common issue with some macOS Python installs).
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Older Python versions lack this attribute and verify HTTPS normally.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the corpora once at import time; nltk.download() skips resources
# that are already up to date.
nltk.download('stopwords')
nltk.download('punkt_tab')

# Download NLTK data only if it is not already available locally.
def download_nltk_resources():
    try:
        stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
    try:
        word_tokenize('test')
    except LookupError:
        # Recent NLTK releases tokenize with the 'punkt_tab' resource; the
        # original 'punkt' download no longer satisfies word_tokenize there.
        nltk.download('punkt_tab')

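# On a machine that already has these resources, download_nltk_resources()
# returns without touching the network, so it is cheap to call per page.
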
# Index a single webpage: extract its title, description, and a normalized
# word list. `webpage` is a parsed HTML document (the find()/get_text()
# calls below match the BeautifulSoup API) and `webpage_url` is its URL.
def advanced_index_page(webpage, webpage_url):
    # Download NLTK data only if not already downloaded.
    download_nltk_resources()

    # Initialize NLTK components.
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()

    # Collect the title, falling back to a placeholder if <title> is missing.
    title_tag = webpage.find('title')
    title = title_tag.get_text().strip() if title_tag else 'No Title'

    # Collect the description: prefer the <meta name="description"> tag and
    # fall back to the first 200 characters of the page's visible text.
    description = ''
    meta_description = webpage.find('meta', attrs={'name': 'description'})
    if meta_description and 'content' in meta_description.attrs:
        description = meta_description['content']
    else:
        text_content = webpage.get_text(separator=" ", strip=True)
        description = text_content[:200] + "..." if len(text_content) > 200 else text_content

    # Grab ALL the words in the page.
    text_content = webpage.get_text(separator=' ', strip=True)

    # Split the text into individual lowercase tokens.
    tokens = word_tokenize(text_content.lower())

    # Big brain techniques 2 and 3: stem the words and remove stop words,
    # keeping only purely alphabetic tokens.
    filtered_words = [
        ps.stem(word) for word in tokens if word.isalpha() and word not in stop_words
    ]
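
    # For illustration: the Porter stemmer maps inflected forms to a common
    # stem, e.g. ps.stem("running") == "run" and ps.stem("searches") == "search",
    # so documents and queries can match on word roots rather than exact forms.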

    # Add the information to the index entry.
    indexed_page = {
        "url": webpage_url,
        "title": title,
        "description": description,
        "words": filtered_words
    }

    # If you want to print the results:
    # print(f"Indexed: {webpage_url}. \n Here's the info: \n title: {title} \n description: {description} \n number of words: {len(filtered_words)} \n")

    return indexed_page
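
# A minimal usage sketch, assuming pages are fetched with `requests` and
# parsed with BeautifulSoup (which matches the find()/get_text() calls
# above); the URL here is only a placeholder.
if __name__ == "__main__":
    import requests
    from bs4 import BeautifulSoup

    response = requests.get("https://example.com")
    soup = BeautifulSoup(response.text, "html.parser")
    entry = advanced_index_page(soup, "https://example.com")
    print(entry["title"], entry["description"][:80], len(entry["words"]))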