adiciona o buscador e cria views e templates para ele

This commit is contained in:
root
2025-03-07 15:31:53 +01:00
parent 1cd93f7955
commit 3f5ac79051
18 changed files with 378 additions and 235 deletions

View File

@ -1,9 +1,11 @@
import json
import debugpy
from django.shortcuts import render
from elasticsearch_dsl import Search, Q
from elasticsearch_dsl.connections import connections
from django.conf import settings
import re
from .documents import PDFDocument
from django.http import JsonResponse
@ -12,224 +14,295 @@ connections.create_connection(hosts=[settings.ELASTICSEARCH_HOSTS])
def spellcheck_view(request):
    """Return spelling suggestions for the ``q`` GET parameter as JSON.

    Runs a phrase suggester named ``auto_correct`` against the ``suggest``
    field of the ``pdf_documents`` index and collects the option texts of
    the first suggestion entry.

    Args:
        request: The incoming HTTP request; only ``request.GET["q"]`` is read.

    Returns:
        A ``JsonResponse`` of the form ``{"suggestions": [...]}``. The list is
        empty when ``q`` is missing/blank or the suggester returns nothing.
    """
    query = request.GET.get("q", "")
    suggestions = []

    if query:
        s = Search(index="pdf_documents")
        s = s.suggest(
            "auto_correct",
            query,
            phrase={
                "field": "suggest",
                "size": 3,
                "gram_size": 3,
                "confidence": 2.0,
                "direct_generator": [{"field": "suggest", "suggest_mode": "popular"}],
            },
        )
        response = s.execute()

        # Guard both levels (same pattern search_view uses) so a response
        # without a suggest section does not raise AttributeError.
        if hasattr(response, "suggest") and hasattr(response.suggest, "auto_correct"):
            entries = response.suggest.auto_correct
            # Only the first entry is used; make sure it exists before
            # indexing so an empty list cannot raise IndexError.
            if len(entries) > 0:
                suggestions = [option.text for option in entries[0].options]

    return JsonResponse({"suggestions": suggestions})
def _excerpt(text, limit=300):
    """Return *text* cut to *limit* characters, with "..." appended if cut."""
    return text[:limit] + "..." if len(text) > limit else text


def _get_pdf_document(doc_id):
    """Return the PDFDocument whose id is *doc_id*, or None if it is missing."""
    try:
        return PDFDocument.objects.get(id=doc_id)
    except PDFDocument.DoesNotExist:
        return None


def _find_matching_pages(pdf_doc, hit, query):
    """Return the sorted, de-duplicated page numbers of *pdf_doc* that match."""
    # Function-scope import: the module does not define a logger.
    import logging

    matching_pages = []

    # NOTE(review): this view only requests highlights for "content" and
    # "title", so a "pages.content" highlight is presumably never present
    # unless the index/search is configured elsewhere -- confirm.
    if hasattr(hit.meta, "highlight") and hasattr(hit.meta.highlight, "pages.content"):
        for highlight in hit.meta.highlight["pages.content"]:
            page_matches = re.findall(r"page_(\d+)", highlight)
            if page_matches:
                matching_pages.append(int(page_matches[0]))

    # Fallback: scan the JSON page dump stored on the model for the raw query.
    if not matching_pages and query and pdf_doc.page_content:
        try:
            page_data = json.loads(pdf_doc.page_content)
            for page_d in page_data:
                if query.lower() in page_d["content"].lower():
                    matching_pages.append(page_d["number"])
        except json.JSONDecodeError as e:
            logging.getLogger(__name__).error(
                "Erro ao decodificar JSON para o documento %s: %s", pdf_doc.id, e
            )

    return sorted(set(matching_pages))


def search_view(request):
    """Search the ``pdf_documents`` index and render the results page.

    Reads ``q`` (search text) and ``page`` (1-based page number) from the
    query string. Text inside double quotes is searched as exact phrases;
    the remaining text is searched with fuzziness. When few hits are found,
    term suggestions, a "did you mean" correction and ``more_like_this``
    related documents are added.

    Args:
        request: The incoming HTTP request.

    Returns:
        The rendered ``diarios/search_results.html`` response.
    """
    query = request.GET.get("q", "")  # search term from the URL
    page = request.GET.get("page", 1)

    # Fall back to page 1 on non-numeric input, and clamp to >= 1 so the
    # slice below can never be negative.
    try:
        page = int(page)
    except (TypeError, ValueError):
        page = 1
    page = max(page, 1)

    results = []
    suggestions = []
    spelling_correction = None
    total_hits = 0
    per_page = 10
    # Initialised up front because the template context reads exact_phrases
    # even when no query was submitted.
    exact_phrases = []
    cleaned_query = ""

    if query:
        # Quoted terms are handled separately as exact phrases.
        exact_phrases = re.findall(r'"([^"]*)"', query)

        # Remove the quoted terms from the main query.
        cleaned_query = query
        for phrase in exact_phrases:
            cleaned_query = cleaned_query.replace(f'"{phrase}"', "")

        # Collapse extra whitespace.
        cleaned_query = re.sub(r"\s+", " ", cleaned_query).strip()

        search = Search(index="pdf_documents")
        queries = []

        # General terms: fuzzy matching to tolerate typos.
        if cleaned_query:
            queries.append(
                Q(
                    "multi_match",
                    query=cleaned_query,
                    fields=["title^3", "content^2", "synonyms^1"],
                    fuzziness="AUTO",
                    boost=2,
                )
            )
            queries.append(Q("match", synonyms={"query": cleaned_query, "boost": 0.5}))

        # Quoted phrases: exact match, no fuzziness, no word reordering.
        for phrase in exact_phrases:
            if phrase.strip():
                # Title match carries the highest weight.
                queries.append(
                    Q(
                        "match_phrase",
                        title={"query": phrase, "boost": 3, "slop": 0},
                    )
                )
                # Content match carries a medium weight.
                queries.append(
                    Q(
                        "match_phrase",
                        content={"query": phrase, "boost": 2, "slop": 0},
                    )
                )

        # Combine all clauses with OR (if there are any).
        if queries:
            search = search.query(Q("bool", should=queries, minimum_should_match=1))

        # Highlight settings: larger fragments to show more context.
        search = search.highlight(
            "content",
            fragment_size=300,
            number_of_fragments=2,
            pre_tags=["<mark>"],
            post_tags=["</mark>"],
        )
        search = search.highlight(
            "title",
            fragment_size=300,
            number_of_fragments=1,
            pre_tags=["<mark>"],
            post_tags=["</mark>"],
        )

        # Pagination.
        search = search[(page - 1) * per_page : page * per_page]

        response = search.execute()
        total_hits = response.hits.total.value

        for hit in response:
            # Resolve the stored PDF behind this hit, if it still exists.
            pdf_doc = _get_pdf_document(hit.meta.id)
            if pdf_doc is None:
                pdf_url = ""
                matching_pages = []
            else:
                pdf_url = pdf_doc.file.url
                matching_pages = _find_matching_pages(pdf_doc, hit, query)

            has_highlight = hasattr(hit.meta, "highlight")

            # Highlighted content, or the first 300 characters as a fallback.
            if has_highlight and hasattr(hit.meta.highlight, "content"):
                highlighted_content = " ... ".join(hit.meta.highlight.content)
            else:
                highlighted_content = _excerpt(hit.content)

            # Highlighted title, or the plain title as a fallback.
            if has_highlight and hasattr(hit.meta.highlight, "title"):
                highlighted_title = hit.meta.highlight.title[0]
            else:
                highlighted_title = hit.title

            # Flag hits that literally contain one of the quoted phrases.
            is_exact_match = any(
                phrase.lower() in hit.content.lower()
                or phrase.lower() in hit.title.lower()
                for phrase in exact_phrases
            )

            results.append(
                {
                    "id": hit.meta.id,
                    "title": hit.title,
                    "highlighted_title": highlighted_title,
                    "highlighted_content": highlighted_content,
                    "uploaded_at": hit.uploaded_at,
                    "score": hit.meta.score,
                    "is_exact_match": is_exact_match,
                    "pdf_url": pdf_url,
                    "matching_pages": matching_pages,
                }
            )

        # "Did you mean" suggestions (only for terms outside quotes).
        if total_hits < 5 and cleaned_query:
            suggestion_search = Search(index="pdf_documents")
            suggestion_search = suggestion_search.suggest(
                "term_suggestion",
                cleaned_query,
                term={"field": "content", "suggest_mode": "popular", "size": 5},
            )
            suggestion_response = suggestion_search.execute()

            if hasattr(suggestion_response, "suggest") and hasattr(
                suggestion_response.suggest, "term_suggestion"
            ):
                term_suggestions = suggestion_response.suggest.term_suggestion
                for suggestion in term_suggestions:
                    for option in suggestion.options:
                        suggestions.append(option.text)

                # Build a corrected query only when nothing was found at all.
                if suggestions and total_hits == 0:
                    corrected_query = cleaned_query
                    for suggestion_term in term_suggestions:
                        if suggestion_term.options:
                            # Replace each misspelt word with its top suggestion.
                            word_to_replace = suggestion_term.text
                            corrected_word = suggestion_term.options[0].text
                            corrected_query = re.sub(
                                r"\b" + re.escape(word_to_replace) + r"\b",
                                corrected_word,
                                corrected_query,
                                flags=re.IGNORECASE,
                            )

                    # Re-attach the quoted phrases to the corrected query.
                    if corrected_query != cleaned_query:
                        spelling_correction = corrected_query
                        for phrase in exact_phrases:
                            spelling_correction += f' "{phrase}"'
                        spelling_correction = spelling_correction.strip()

        # Related documents (only when there are very few results).
        if total_hits < 3 and cleaned_query:
            related_terms = Search(index="pdf_documents")
            related_terms = related_terms.query(
                "more_like_this",
                fields=["content", "title"],
                like=cleaned_query,
                min_term_freq=1,
                max_query_terms=10,
                min_doc_freq=1,
            )
            related_terms = related_terms[:5]
            related_response = related_terms.execute()

            for hit in related_response:
                # Skip documents that are already in the results.
                if any(r.get("id") == hit.meta.id for r in results):
                    continue
                # Look up this document's own PDF; the URL must not be
                # carried over from the main result loop above.
                related_doc = _get_pdf_document(hit.meta.id)
                results.append(
                    {
                        "id": hit.meta.id,
                        "title": hit.title,
                        "highlighted_title": hit.title,
                        "highlighted_content": _excerpt(hit.content),
                        "uploaded_at": hit.uploaded_at,
                        "score": hit.meta.score,
                        "is_related": True,
                        "pdf_url": related_doc.file.url if related_doc is not None else "",
                    }
                )

    # Number of pages, rounding up.
    total_pages = (total_hits + per_page - 1) // per_page if total_hits > 0 else 0

    return render(
        request,
        "diarios/search_results.html",
        {
            "query": query,
            "results": results,
            "suggestions": suggestions[:5],  # cap at 5 suggestions
            "spelling_correction": spelling_correction,
            "total_hits": total_hits,
            "page": page,
            "total_pages": total_pages,
            "page_range": range(max(1, page - 2), min(total_pages + 1, page + 3)),
            "has_exact_phrases": bool(exact_phrases),
        },
    )