Modifica a view de pesquisa, o modelo de Diário Oficial e o documento do Diário Oficial

This commit is contained in:
root
2025-03-07 16:32:10 +01:00
parent 6471ee6152
commit 8d1f6feeaf
4 changed files with 203 additions and 204 deletions

View File

@ -65,3 +65,93 @@ class PDFDocumentDocument(Document):
class Django: class Django:
model = PDFDocument model = PDFDocument
fields = ["uploaded_at", "file"] fields = ["uploaded_at", "file"]
from django_elasticsearch_dsl import Document, fields
from django_elasticsearch_dsl.registries import registry
from .models import DiarioOficial
@registry.register_document
class DiarioOficialDocument(Document):
    """Elasticsearch document definition for ``DiarioOficial`` records.

    Indexes a computed ``title``, the type name, the file URL and a nested
    list of pages (``number`` / ``content``) into the ``diarios_oficiais``
    index, alongside the model fields listed in ``Django.fields``.
    """

    # Main fields (both are filled by the prepare_* methods below).
    title = fields.TextField()
    tipo = fields.KeywordField()
    # PDF file field (if applicable); reads ``instance.arquivo.url``.
    arquivo = fields.TextField(attr="arquivo.url")
    # Nested field for the pages, fed from the model's ``page_content``.
    pages = fields.NestedField(
        properties={
            "number": fields.IntegerField(),
            "content": fields.TextField(analyzer="portuguese"),
        }
    )

    class Index:
        name = "diarios_oficiais"
        # NOTE: changing analysis settings only takes effect after the index
        # is rebuilt.
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "analysis": {
                "analyzer": {
                    "portuguese": {
                        "type": "custom",
                        "tokenizer": "standard",
                        # The stop filter runs right after lowercasing:
                        # the "_portuguese_" list holds accented, unstemmed
                        # words, so folding/stemming first would let many
                        # stopwords slip through.
                        "filter": [
                            "lowercase",
                            "stop",
                            "ascii_folding",
                            "portuguese_stemmer",
                            "portuguese_synonyms",
                        ],
                    },
                    "portuguese_search": {
                        "type": "custom",
                        "tokenizer": "standard",
                        # Same ordering rationale as the "portuguese"
                        # analyzer above.
                        "filter": [
                            "lowercase",
                            "stop",
                            "ascii_folding",
                            "portuguese_stemmer",
                            "suggest_shingle",
                        ],
                    },
                },
                "filter": {
                    "suggest_shingle": {
                        "type": "shingle",
                        "min_shingle_size": 2,
                        "max_shingle_size": 3,
                    },
                    "stop": {"type": "stop", "stopwords": "_portuguese_"},
                    "ascii_folding": {"type": "asciifolding"},
                    "portuguese_stemmer": {
                        "type": "stemmer",
                        "language": "portuguese",
                    },
                    "portuguese_synonyms": {
                        "type": "synonym",
                        "synonyms_path": "synonyms.txt",
                        "expand": True,
                    },
                },
            },
        }

    class Django:
        model = DiarioOficial
        fields = [
            "data",
            "numero",
            "link",
        ]

    def prepare_tipo(self, instance):
        """Return the related type's ``nome``, or ``None`` when unset."""
        return instance.tipo.nome if instance.tipo else None

    def prepare_title(self, instance):
        """Build the title as ``"<type name or 'Diário'> <numero>"``."""
        return f"{instance.tipo.nome if instance.tipo else 'Diário'} {instance.numero}"

    def prepare_pages(self, instance):
        """Return the page list to index in the nested ``pages`` field.

        ``page_content`` may hold either a list of page dicts or a
        JSON-encoded string of that list (e.g. when it was written with
        ``json.dumps``). Strings are decoded so the nested field always
        receives a list; empty, undecodable or non-list values yield ``[]``.
        """
        import json  # local import: keeps this block self-contained

        pages = instance.page_content
        if not pages:
            return []
        if isinstance(pages, (str, bytes, bytearray)):
            try:
                pages = json.loads(pages)
            except ValueError:
                # Malformed JSON: index the document without pages rather
                # than failing the whole indexing operation.
                return []
        return pages if isinstance(pages, list) else []

View File

@ -0,0 +1,39 @@
# Generated by Django 5.0.12 on 2025-03-07 15:25
import django.core.serializers.json
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
    """Schema changes for the ``diariooficial`` model of the ``diarios`` app.

    Removes the ``finalizado`` field, adds a nullable JSON ``page_content``
    field encoded with ``DjangoJSONEncoder``, and redefines ``tipo`` as a
    nullable foreign key to ``diarios.tipodiariooficial`` that is set to
    NULL when the referenced row is deleted.
    """

    # Runs after migration 0003 of the same app.
    dependencies = [
        ("diarios", "0003_tipodiariooficial_diariooficial_and_more"),
    ]

    operations = [
        # 1) Drop the ``finalizado`` field.
        migrations.RemoveField(model_name="diariooficial", name="finalizado"),
        # 2) Add the optional JSON field ``page_content``.
        migrations.AddField(
            model_name="diariooficial",
            name="page_content",
            field=models.JSONField(
                blank=True,
                encoder=django.core.serializers.json.DjangoJSONEncoder,
                null=True,
            ),
        ),
        # 3) Redefine ``tipo`` with on_delete=SET_NULL.
        migrations.AlterField(
            model_name="diariooficial",
            name="tipo",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name="diarios",
                to="diarios.tipodiariooficial",
            ),
        ),
    ]

View File

@ -1,6 +1,7 @@
from django.db import models from django.db import models
import PyPDF2 import PyPDF2
import json import json
from django.core.serializers.json import DjangoJSONEncoder
class PDFDocument(models.Model): class PDFDocument(models.Model):
@ -50,12 +51,29 @@ class DiarioOficial(models.Model):
TipoDiarioOficial, TipoDiarioOficial,
blank=True, blank=True,
null=True, null=True,
on_delete=models.CASCADE, on_delete=models.SET_NULL,
related_name="diarios", related_name="diarios",
) )
numero = models.CharField(max_length=20, unique=True) numero = models.CharField(max_length=20, unique=True)
link = models.URLField(blank=True, null=True, unique=True) link = models.URLField(blank=True, null=True, unique=True)
finalizado = models.BooleanField(default=False) page_content = models.JSONField(encoder=DjangoJSONEncoder, blank=True, null=True)
def save(self, *args, **kwargs):
    """Extract per-page text from the attached PDF, then save the model.

    When ``self.file`` is truthy, the PDF is read with ``PyPDF2.PdfReader``
    and ``page_content`` is replaced by a list of
    ``{"number": <1-based page index>, "content": <extracted text>}`` dicts.
    All positional and keyword arguments are forwarded to the parent
    ``save``.
    """
    # NOTE(review): this relies on the model exposing a ``file`` attribute;
    # confirm the field name against the model definition.
    if self.file:
        pdf = PyPDF2.PdfReader(self.file)
        # Store the Python list itself: the JSONField serializes it on
        # write. Passing a ``json.dumps`` string here would be encoded a
        # second time and read back as a plain ``str`` instead of a list.
        self.page_content = [
            {"number": page_number, "content": pagina.extract_text()}
            for page_number, pagina in enumerate(pdf.pages, start=1)
        ]

    super().save(*args, **kwargs)
@property @property
def data_formatada(self): def data_formatada(self):
@ -71,3 +89,4 @@ class DiarioOficial(models.Model):
class Meta: class Meta:
constraints = [models.UniqueConstraint(fields=["numero"], name="unique_numero")] constraints = [models.UniqueConstraint(fields=["numero"], name="unique_numero")]
verbose_name_plural = "Diários Oficiais" verbose_name_plural = "Diários Oficiais"

View File

@ -39,14 +39,8 @@ def spellcheck_view(request):
return JsonResponse({"suggestions": suggestions}) return JsonResponse({"suggestions": suggestions})
def search_view(request): def search_view(request):
query = request.GET.get("q", "") # Obtém o termo de pesquisa da URL query = request.GET.get('q', '') # Obtém o termo de pesquisa da URL
page = request.GET.get("page", 1) # Obtém o valor de "page" (padrão: 1) page = int(request.GET.get('page', 1))
# Converte page para int
try:
page = int(page)
except ValueError:
page = 1 # Valor padrão em caso de erro
results = [] results = []
suggestions = [] suggestions = []
@ -61,13 +55,13 @@ def search_view(request):
# Remove os termos entre aspas da consulta principal # Remove os termos entre aspas da consulta principal
cleaned_query = query cleaned_query = query
for phrase in exact_phrases: for phrase in exact_phrases:
cleaned_query = cleaned_query.replace(f'"{phrase}"', "") cleaned_query = cleaned_query.replace(f'"{phrase}"', '')
# Remove espaços extras e pontuação desnecessária # Remove espaços extras e pontuação desnecessária
cleaned_query = re.sub(r"\s+", " ", cleaned_query).strip() cleaned_query = re.sub(r'\s+', ' ', cleaned_query).strip()
# Cria uma consulta no Elasticsearch # Cria uma consulta no Elasticsearch
search = Search(index="pdf_documents") search = Search(index='diarios_oficiais')
# Lista para armazenar todas as consultas # Lista para armazenar todas as consultas
queries = [] queries = []
@ -75,61 +69,38 @@ def search_view(request):
# Adiciona consulta para termos gerais (com fuzziness para tolerância a erros) # Adiciona consulta para termos gerais (com fuzziness para tolerância a erros)
if cleaned_query: if cleaned_query:
queries.append( queries.append(
Q( Q('multi_match',
"multi_match",
query=cleaned_query, query=cleaned_query,
fields=["title^3", "content^2", "synonyms^1"], fields=['title^3', 'pages.content^2'],
fuzziness="AUTO", fuzziness='AUTO',
boost=2, boost=2)
) )
)
queries.append(Q("match", synonyms={"query": cleaned_query, "boost": 0.5}))
# Adiciona consultas exatas para frases entre aspas (sem fuzziness) # Adiciona consultas exatas para frases entre aspas (sem fuzziness)
for phrase in exact_phrases: for phrase in exact_phrases:
if phrase.strip(): if phrase.strip():
# Consulta de frase exata para o título com peso alto
queries.append( queries.append(
Q( Q('match_phrase',
"match_phrase", pages__content={
title={ 'query': phrase,
"query": phrase, 'boost': 2,
"boost": 3, 'slop': 0 # Sem flexibilidade na ordem das palavras
"slop": 0, # Sem flexibilidade na ordem das palavras })
},
)
)
# Consulta de frase exata para o conteúdo com peso médio
queries.append(
Q(
"match_phrase",
content={
"query": phrase,
"boost": 2,
"slop": 0, # Sem flexibilidade na ordem das palavras
},
)
) )
# Combina as consultas com OR (se houver alguma) # Combina as consultas com OR (se houver alguma)
if queries: if queries:
search = search.query(Q("bool", should=queries, minimum_should_match=1)) search = search.query(
Q('bool', should=queries, minimum_should_match=1)
)
# Configuração do highlight para mostrar mais contexto # Configuração do highlight para mostrar mais contexto
search = search.highlight( search = search.highlight(
"content", 'pages.content',
fragment_size=300, fragment_size=300,
number_of_fragments=2, number_of_fragments=2,
pre_tags=["<mark>"], pre_tags=['<mark>'],
post_tags=["</mark>"], post_tags=['</mark>']
)
search = search.highlight(
"title",
fragment_size=300,
number_of_fragments=1,
pre_tags=["<mark>"],
post_tags=["</mark>"],
) )
# Paginação # Paginação
@ -141,168 +112,48 @@ def search_view(request):
# Processa os resultados # Processa os resultados
for hit in response: for hit in response:
# Obter o objeto PDFDocument correspondente
try:
pdf_doc = PDFDocument.objects.get(id=hit.meta.id)
pdf_url = pdf_doc.file.url # URL do PDF
matching_pages = []
if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'pages.content'):
for highlight in hit.meta.highlight['pages.content']:
page_matches = re.findall(r'page_(\d+)', highlight)
if page_matches:
matching_pages.append(int(page_matches[0]))
if not matching_pages and query:
if pdf_doc.page_content:
try:
page_data = json.loads(pdf_doc.page_content)
for page_d in page_data:
if query.lower() in page_d['content'].lower():
matching_pages.append(page_d['number'])
except json.JSONDecodeError as e:
logger.error(f"Erro ao decodificar JSON para o documento {pdf_doc.id}: {e}")
page_data = []
else:
page_data = []
matching_pages = sorted(list(set(matching_pages)))
except PDFDocument.DoesNotExist:
pdf_url = ""
matching_pages = []
# Extrai o conteúdo destacado ou usa o original # Extrai o conteúdo destacado ou usa o original
if hasattr(hit.meta, "highlight") and hasattr( if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'pages.content'):
hit.meta.highlight, "content" highlighted_content = ' ... '.join(hit.meta.highlight['pages.content'])
):
highlighted_content = " ... ".join(hit.meta.highlight.content)
else: else:
# Se não houver highlight, pegue os primeiros 300 caracteres highlighted_content = ""
highlighted_content = (
hit.content[:300] + "..."
if len(hit.content) > 300
else hit.content
)
# Extrai o título destacado ou usa o original # Extrai o título destacado ou usa o original
if hasattr(hit.meta, "highlight") and hasattr( if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'title'):
hit.meta.highlight, "title"
):
highlighted_title = hit.meta.highlight.title[0] highlighted_title = hit.meta.highlight.title[0]
else: else:
highlighted_title = hit.title highlighted_title = hit.title
# Verifica se o resultado corresponde a uma frase exata # Verifica se o resultado corresponde a uma frase exata
is_exact_match = any( is_exact_match = any(phrase.lower() in hit.pages.content.lower() or
phrase.lower() in hit.content.lower() phrase.lower() in hit.title.lower()
or phrase.lower() in hit.title.lower() for phrase in exact_phrases)
for phrase in exact_phrases
)
results.append( results.append({
{ 'id': hit.meta.id,
"id": hit.meta.id, 'title': hit.title,
"title": hit.title, 'highlighted_title': highlighted_title,
"highlighted_title": highlighted_title, 'highlighted_content': highlighted_content,
"highlighted_content": highlighted_content, 'data': hit.data,
"uploaded_at": hit.uploaded_at, 'numero': hit.numero,
"score": hit.meta.score, 'link': hit.link,
"is_exact_match": is_exact_match, 'finalizado': hit.finalizado,
"pdf_url": pdf_url, 'is_exact_match': is_exact_match
"matching_pages": matching_pages, })
}
)
# Sugestões "Você quis dizer" (apenas para termos fora de aspas)
if total_hits < 5 and cleaned_query:
suggestion_search = Search(index="pdf_documents")
suggestion_search = suggestion_search.suggest(
"term_suggestion",
cleaned_query,
term={"field": "content", "suggest_mode": "popular", "size": 5},
)
suggestion_response = suggestion_search.execute()
if hasattr(suggestion_response, "suggest") and hasattr(
suggestion_response.suggest, "term_suggestion"
):
for suggestion in suggestion_response.suggest.term_suggestion:
for option in suggestion.options:
suggestions.append(option.text)
# Cria uma correção ortográfica se necessário
if suggestions and total_hits == 0:
corrected_query = cleaned_query
for (
suggestion_term
) in suggestion_response.suggest.term_suggestion:
if suggestion_term.options:
# Substitui palavras incorretas por sugestões
word_to_replace = suggestion_term.text
corrected_word = suggestion_term.options[0].text
corrected_query = re.sub(
r"\b" + re.escape(word_to_replace) + r"\b",
corrected_word,
corrected_query,
flags=re.IGNORECASE,
)
# Reconstrói a consulta original mantendo as frases entre aspas
if corrected_query != cleaned_query:
spelling_correction = corrected_query
for phrase in exact_phrases:
spelling_correction += f' "{phrase}"'
spelling_correction = spelling_correction.strip()
# Busca por termos relacionados (apenas se houver poucos resultados)
if total_hits < 3 and cleaned_query:
related_terms = Search(index="pdf_documents")
related_terms = related_terms.query(
"more_like_this",
fields=["content", "title"],
like=cleaned_query,
min_term_freq=1,
max_query_terms=10,
min_doc_freq=1,
)
related_terms = related_terms[:5]
related_response = related_terms.execute()
for hit in related_response:
# Verifica se este documento já está nos resultados
if not any(r.get("id") == hit.meta.id for r in results):
results.append(
{
"id": hit.meta.id,
"title": hit.title,
"highlighted_title": hit.title,
"highlighted_content": (
hit.content[:300] + "..."
if len(hit.content) > 300
else hit.content
),
"uploaded_at": hit.uploaded_at,
"score": hit.meta.score,
"is_related": True,
"pdf_url": pdf_url,
}
)
# Calcula a paginação # Calcula a paginação
total_pages = (total_hits + per_page - 1) // per_page if total_hits > 0 else 0 total_pages = (total_hits + per_page - 1) // per_page if total_hits > 0 else 0
# Renderiza o template com os resultados # Renderiza o template com os resultados
return render( return render(request, 'diarios/search_results.html', {
request, 'query': query,
"diarios/search_results.html", 'results': results,
{ 'suggestions': suggestions[:5], # Limita a 5 sugestões
"query": query, 'spelling_correction': spelling_correction,
"results": results, 'total_hits': total_hits,
"suggestions": suggestions[:5], # Limita a 5 sugestões 'page': page,
"spelling_correction": spelling_correction, 'total_pages': total_pages,
"total_hits": total_hits, 'page_range': range(max(1, page-2), min(total_pages+1, page+3)),
"page": page, 'has_exact_phrases': bool(exact_phrases)
"total_pages": total_pages, })
"page_range": range(max(1, page - 2), min(total_pages + 1, page + 3)),
"has_exact_phrases": bool(exact_phrases),
},
)