2025-03-07 15:31:53 +01:00
|
|
|
import json
|
|
|
|
|
import debugpy
|
2025-03-07 07:10:13 +01:00
|
|
|
from django.shortcuts import render
|
|
|
|
|
from elasticsearch_dsl import Search, Q
|
|
|
|
|
from elasticsearch_dsl.connections import connections
|
|
|
|
|
from django.conf import settings
|
|
|
|
|
import re
|
2025-03-07 15:31:53 +01:00
|
|
|
from .documents import PDFDocument
|
2025-03-07 07:10:13 +01:00
|
|
|
from django.http import JsonResponse
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Configuração da conexão com o Elasticsearch
|
|
|
|
|
# Create the elasticsearch_dsl connection as a side effect of importing this
# module, using the hosts configured in Django settings.
# NOTE(review): the setting is wrapped in a list here; if
# settings.ELASTICSEARCH_HOSTS is already a list this produces a nested list —
# confirm the setting holds a single host value.
connections.create_connection(hosts=[settings.ELASTICSEARCH_HOSTS])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def spellcheck_view(request):
    """Return spelling suggestions for the ``q`` GET parameter as JSON.

    Runs a phrase suggester named ``auto_correct`` against the ``suggest``
    field of the ``pdf_documents`` index and collects the text of every
    option of the first suggestion entry.

    Args:
        request: The incoming Django request; only ``request.GET["q"]`` is read.

    Returns:
        JsonResponse: ``{"suggestions": [...]}``. The list is empty when the
        query is missing/blank or when Elasticsearch returns no suggestion.
    """
    query = request.GET.get("q", "").strip()
    suggestions = []

    if query:
        # Only the "suggest" section of the response is used, so ask
        # Elasticsearch not to return any document hits.
        s = Search(index="pdf_documents").extra(size=0)
        s = s.suggest(
            "auto_correct",
            query,
            phrase={
                "field": "suggest",
                "size": 3,
                "gram_size": 3,
                "confidence": 2.0,
                "direct_generator": [{"field": "suggest", "suggest_mode": "popular"}],
            },
        )
        response = s.execute()

        # Guard both a missing "suggest" section and an empty entry list so
        # that indexing [0] can never raise.
        suggest_section = getattr(response, "suggest", None)
        entries = getattr(suggest_section, "auto_correct", None) or []
        if entries:
            suggestions = [option.text for option in entries[0].options]

    return JsonResponse({"suggestions": suggestions})
|
2025-03-07 07:10:13 +01:00
|
|
|
|
|
|
|
|
def _truncate_content(text, limit=300):
    """Return *text* cut to *limit* characters, with "..." appended when cut."""
    return text[:limit] + "..." if len(text) > limit else text


def _load_pdf_document(doc_id):
    """Return the PDFDocument whose id equals *doc_id*, or None if it is missing."""
    try:
        return PDFDocument.objects.get(id=doc_id)
    except PDFDocument.DoesNotExist:
        return None


def _find_matching_pages(hit, pdf_doc, query):
    """Return the sorted, de-duplicated page numbers of *pdf_doc* matching *query*."""
    import logging

    pages = []

    # Page numbers embedded in "pages.content" highlight fragments, if any.
    if hasattr(hit.meta, "highlight") and hasattr(hit.meta.highlight, "pages.content"):
        for fragment in hit.meta.highlight["pages.content"]:
            found = re.findall(r"page_(\d+)", fragment)
            if found:
                pages.append(int(found[0]))

    # Fallback: case-insensitive substring scan of the stored per-page JSON.
    if not pages and query and pdf_doc.page_content:
        try:
            page_data = json.loads(pdf_doc.page_content)
        except json.JSONDecodeError as exc:
            logging.getLogger(__name__).error(
                "Erro ao decodificar JSON para o documento %s: %s", pdf_doc.id, exc
            )
            page_data = []
        needle = query.lower()
        for page_d in page_data:
            if needle in page_d["content"].lower():
                pages.append(page_d["number"])

    return sorted(set(pages))


def search_view(request):
    """Search the ``pdf_documents`` index and render the results page.

    Reads ``q`` (search text) and ``page`` (1-based page number) from the
    query string. Text inside double quotes is searched as exact phrases
    (``match_phrase``, slop 0) on title and content; the remaining text is
    searched with a fuzzy ``multi_match`` plus a ``match`` on synonyms. All
    clauses are OR-combined. When few hits come back the view also asks for
    term suggestions, builds a "did you mean" correction (only when there are
    no hits at all) and appends "more like this" related documents.

    Args:
        request: The incoming Django request.

    Returns:
        HttpResponse: ``diarios/search_results.html`` rendered with the keys
        ``query``, ``results``, ``suggestions``, ``spelling_correction``,
        ``total_hits``, ``page``, ``total_pages``, ``page_range`` and
        ``has_exact_phrases``.
    """
    query = request.GET.get("q", "")  # search text from the URL

    # Parse the page number; fall back to 1 on garbage and clamp to >= 1 so
    # the slice below never gets a negative offset.
    try:
        page = int(request.GET.get("page", 1))
    except ValueError:
        page = 1
    page = max(page, 1)

    results = []
    suggestions = []
    spelling_correction = None
    total_hits = 0
    per_page = 10
    # Bound up front because the template context reads it even when there is
    # no query.
    exact_phrases = []

    if query:
        # Special handling for quoted terms.
        exact_phrases = re.findall(r'"([^"]*)"', query)

        # Remove the quoted terms from the main query.
        cleaned_query = query
        for phrase in exact_phrases:
            cleaned_query = cleaned_query.replace(f'"{phrase}"', "")

        # Collapse runs of whitespace.
        cleaned_query = re.sub(r"\s+", " ", cleaned_query).strip()

        search = Search(index="pdf_documents")

        # Every clause collected here is OR-combined below.
        queries = []

        # General terms: fuzzy matching for typo tolerance.
        if cleaned_query:
            queries.append(
                Q(
                    "multi_match",
                    query=cleaned_query,
                    fields=["title^3", "content^2", "synonyms^1"],
                    fuzziness="AUTO",
                    boost=2,
                )
            )
            queries.append(Q("match", synonyms={"query": cleaned_query, "boost": 0.5}))

        # Quoted phrases: exact matching, no fuzziness.
        for phrase in exact_phrases:
            if phrase.strip():
                # Exact phrase in the title, high weight.
                queries.append(
                    Q(
                        "match_phrase",
                        title={
                            "query": phrase,
                            "boost": 3,
                            "slop": 0,  # no flexibility in word order
                        },
                    )
                )
                # Exact phrase in the content, medium weight.
                queries.append(
                    Q(
                        "match_phrase",
                        content={
                            "query": phrase,
                            "boost": 2,
                            "slop": 0,  # no flexibility in word order
                        },
                    )
                )

        # Combine the clauses with OR (if there are any).
        if queries:
            search = search.query(Q("bool", should=queries, minimum_should_match=1))

        # Highlight configuration.
        search = search.highlight(
            "content",
            fragment_size=300,
            number_of_fragments=2,
            pre_tags=["<mark>"],
            post_tags=["</mark>"],
        )
        search = search.highlight(
            "title",
            fragment_size=300,
            number_of_fragments=1,
            pre_tags=["<mark>"],
            post_tags=["</mark>"],
        )

        # Pagination.
        search = search[(page - 1) * per_page : page * per_page]

        # Run the query.
        response = search.execute()
        total_hits = response.hits.total.value

        # Build one result dict per hit.
        for hit in response:
            pdf_doc = _load_pdf_document(hit.meta.id)
            if pdf_doc is None:
                pdf_url = ""
                matching_pages = []
            else:
                pdf_url = pdf_doc.file.url  # URL of the PDF
                matching_pages = _find_matching_pages(hit, pdf_doc, query)

            # Highlighted content, or the first 300 characters of the original.
            if hasattr(hit.meta, "highlight") and hasattr(hit.meta.highlight, "content"):
                highlighted_content = " ... ".join(hit.meta.highlight.content)
            else:
                highlighted_content = _truncate_content(hit.content)

            # Highlighted title, or the original one.
            if hasattr(hit.meta, "highlight") and hasattr(hit.meta.highlight, "title"):
                highlighted_title = hit.meta.highlight.title[0]
            else:
                highlighted_title = hit.title

            # Does the hit literally contain one of the quoted phrases?
            is_exact_match = any(
                phrase.lower() in hit.content.lower()
                or phrase.lower() in hit.title.lower()
                for phrase in exact_phrases
            )

            results.append(
                {
                    "id": hit.meta.id,
                    "title": hit.title,
                    "highlighted_title": highlighted_title,
                    "highlighted_content": highlighted_content,
                    "uploaded_at": hit.uploaded_at,
                    "score": hit.meta.score,
                    "is_exact_match": is_exact_match,
                    "pdf_url": pdf_url,
                    "matching_pages": matching_pages,
                }
            )

        # "Did you mean" suggestions (only for the terms outside quotes).
        if total_hits < 5 and cleaned_query:
            suggestion_search = Search(index="pdf_documents")
            suggestion_search = suggestion_search.suggest(
                "term_suggestion",
                cleaned_query,
                term={"field": "content", "suggest_mode": "popular", "size": 5},
            )
            suggestion_response = suggestion_search.execute()

            if hasattr(suggestion_response, "suggest") and hasattr(
                suggestion_response.suggest, "term_suggestion"
            ):
                for suggestion in suggestion_response.suggest.term_suggestion:
                    for option in suggestion.options:
                        suggestions.append(option.text)

        # Build a spelling correction when nothing was found. A non-empty
        # `suggestions` implies the suggestion request above was executed.
        if suggestions and total_hits == 0:
            corrected_query = cleaned_query
            for suggestion_term in suggestion_response.suggest.term_suggestion:
                if suggestion_term.options:
                    # Replace the misspelled word with the top suggestion.
                    word_to_replace = suggestion_term.text
                    corrected_word = suggestion_term.options[0].text
                    corrected_query = re.sub(
                        r"\b" + re.escape(word_to_replace) + r"\b",
                        corrected_word,
                        corrected_query,
                        flags=re.IGNORECASE,
                    )

            # Rebuild the full query, keeping the quoted phrases.
            if corrected_query != cleaned_query:
                spelling_correction = corrected_query
                for phrase in exact_phrases:
                    spelling_correction += f' "{phrase}"'
                spelling_correction = spelling_correction.strip()

        # Related documents (only when there are very few results).
        if total_hits < 3 and cleaned_query:
            related_terms = Search(index="pdf_documents")
            related_terms = related_terms.query(
                "more_like_this",
                fields=["content", "title"],
                like=cleaned_query,
                min_term_freq=1,
                max_query_terms=10,
                min_doc_freq=1,
            )
            related_terms = related_terms[:5]
            related_response = related_terms.execute()

            seen_ids = {entry.get("id") for entry in results}
            for hit in related_response:
                # Skip documents that are already listed.
                if hit.meta.id in seen_ids:
                    continue
                seen_ids.add(hit.meta.id)

                # Resolve the URL for this document rather than reusing the
                # value left over from the main result loop.
                related_doc = _load_pdf_document(hit.meta.id)
                results.append(
                    {
                        "id": hit.meta.id,
                        "title": hit.title,
                        "highlighted_title": hit.title,
                        "highlighted_content": _truncate_content(hit.content),
                        "uploaded_at": hit.uploaded_at,
                        "score": hit.meta.score,
                        "is_related": True,
                        "pdf_url": related_doc.file.url if related_doc is not None else "",
                    }
                )

    # Number of pages (ceiling division).
    total_pages = (total_hits + per_page - 1) // per_page if total_hits > 0 else 0

    # Render the template with the results.
    return render(
        request,
        "diarios/search_results.html",
        {
            "query": query,
            "results": results,
            "suggestions": suggestions[:5],  # at most 5 suggestions
            "spelling_correction": spelling_correction,
            "total_hits": total_hits,
            "page": page,
            "total_pages": total_pages,
            "page_range": range(max(1, page - 2), min(total_pages + 1, page + 3)),
            "has_exact_phrases": bool(exact_phrases),
        },
    )
|