Search/search/indexing/simple_indexing.py
The Coding Sloth 7771dffa6e first commit
2024-11-07 17:30:24 -05:00

34 lines
No EOL
1.3 KiB
Python

import re
def simple_index_page(webpage, webpage_url):
    """Build a simple index record (title, description, words) for one page.

    Args:
        webpage: Parsed HTML document exposing ``find(name, attrs=...)`` and
            ``get_text(separator=..., strip=...)``. Presumably a BeautifulSoup
            object -- confirm against the caller.
        webpage_url: URL of the page; stored unchanged in the record.

    Returns:
        A dict with the keys ``"url"``, ``"title"``, ``"description"`` and
        ``"words"``. ``"words"`` is the list of lower-cased, purely
        alphabetic tokens of the page text, in document order and with
        duplicates kept.

    Side effects:
        Prints a short summary of the indexed page to stdout.
    """
    # Extract the page text once; it feeds both the fallback description
    # and the word list, so there is no need to walk the document twice.
    page_text = webpage.get_text(separator=" ", strip=True)

    # Collect the title. A missing tag and an empty/whitespace-only tag are
    # treated alike so the record never carries a blank title.
    title_tag = webpage.find('title')
    title = title_tag.get_text().strip() if title_tag is not None else ''
    if not title:
        title = 'No Title'

    # Collect the description: prefer the meta description, otherwise use
    # the first 200 characters of the page text.
    description = ''
    meta_description = webpage.find('meta', attrs={'name': 'description'})
    if meta_description is not None:
        content = meta_description.attrs.get('content')
        if isinstance(content, str):
            description = content.strip()
    if not description:
        # Also reached when the meta tag exists but its content is blank.
        if len(page_text) > 200:
            description = page_text[:200] + "..."
        else:
            description = page_text

    # Grab all the words in the page, then drop any token that is not purely
    # alphabetic (numbers, underscores, mixed tokens such as "abc123").
    tokens = re.findall(r'\b\w+\b', page_text.lower())
    words = [token for token in tokens if token.isalpha()]

    # Add the information to the index record.
    indexed_page = {
        "url": webpage_url,
        "title": title,
        "description": description,
        "words": words,
    }
    print(f"Indexed: {webpage_url}. \n Here's the info: \n title: {title} \n description: {description} \n number of words: {len(words)} \n")
    return indexed_page