Search/search/indexing/advanced_indexing.py

import nltk
import ssl
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Work around SSL certificate errors that can block nltk.download()
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
# Download NLTK data only if not already downloaded
def download_nltk_resources():
    try:
        stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
    try:
        word_tokenize('test')
    except LookupError:
        nltk.download('punkt_tab')
# Function that indexes the webpage
def advanced_index_page(webpage, webpage_url):
    # Download NLTK data only if not already downloaded
    download_nltk_resources()

    # Initialize NLTK components
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()

    # Collect the title
    title_tag = webpage.find('title')
    title = title_tag.get_text().strip() if title_tag else 'No Title'

    # Grab ALL the visible text on the page once; it is reused below
    text_content = webpage.get_text(separator=' ', strip=True)

    # Collect the description from the meta tag, falling back to the
    # first 200 characters of the page text
    meta_description = webpage.find('meta', attrs={'name': 'description'})
    if meta_description and 'content' in meta_description.attrs:
        description = meta_description['content']
    else:
        description = text_content[:200] + "..." if len(text_content) > 200 else text_content

    # Splitting the text into individual words
    tokens = word_tokenize(text_content.lower())

    # Big brain techniques 2 and 3:
    # stemming the words and removing stop words
    filtered_words = [
        ps.stem(word) for word in tokens if word.isalpha() and word not in stop_words
    ]
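
    # A quick illustration of what the two techniques above do (the example
    # string is mine, not from the original file):
    #   word_tokenize("Dogs are running quickly".lower())
    #   -> ['dogs', 'are', 'running', 'quickly']
    # Dropping the stop word 'are' and Porter-stemming the rest yields
    #   -> ['dog', 'run', 'quickli']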
    # Add the information to the index
    indexed_page = {
        "url": webpage_url,
        "title": title,
        "description": description,
        "words": filtered_words
    }

    # If you want to print the results:
    # print(f"Indexed: {webpage_url}. \n Here's the info: \n title: {title} \n description: {description} \n number of words: {len(filtered_words)} \n")
    return indexed_page
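
# A minimal usage sketch, not part of the original file: it assumes `webpage`
# is a BeautifulSoup object (which matches the .find()/.get_text() calls
# above) and uses a placeholder URL purely for illustration.
if __name__ == "__main__":
    import requests
    from bs4 import BeautifulSoup

    url = "https://example.com"  # hypothetical example URL
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    page = advanced_index_page(soup, url)
    print(page["title"], "-", len(page["words"]), "indexed words")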