diff --git a/diarios/documents.py b/diarios/documents.py
index 6fc178a..a05ea81 100644
--- a/diarios/documents.py
+++ b/diarios/documents.py
@@ -65,3 +65,93 @@ class PDFDocumentDocument(Document):
class Django:
model = PDFDocument
fields = ["uploaded_at", "file"]
+
+from django_elasticsearch_dsl import Document, fields
+from django_elasticsearch_dsl.registries import registry
+from .models import DiarioOficial
+
+@registry.register_document
+class DiarioOficialDocument(Document):
+    """Elasticsearch document for ``DiarioOficial`` records.
+
+    Indexes a computed title, the type name, the file URL and one nested
+    object per PDF page (``number`` / ``content``), in addition to the model
+    fields listed in ``Django.fields``.
+    """
+
+    # Main fields; their values come from prepare_title / prepare_tipo.
+    title = fields.TextField()
+    tipo = fields.KeywordField()
+
+    # PDF file URL (if applicable).
+    # NOTE(review): DiarioOficial.save() reads ``self.file`` while this field
+    # reads ``arquivo.url`` -- confirm which attribute the model defines.
+    arquivo = fields.TextField(attr="arquivo.url")
+
+    # One nested object per PDF page, filled by prepare_pages.
+    # NOTE(review): Elasticsearch only matches sub-fields of a nested field
+    # through a "nested" query (path="pages"); confirm the search code
+    # queries and highlights pages.content that way.
+    pages = fields.NestedField(
+        properties={
+            "number": fields.IntegerField(),
+            "content": fields.TextField(analyzer="portuguese"),
+        }
+    )
+
+    class Index:
+        name = "diarios_oficiais"
+        settings = {
+            "number_of_shards": 1,
+            "number_of_replicas": 0,
+            "analysis": {
+                "analyzer": {
+                    # Custom analyzer referenced by pages.content above.
+                    # NOTE(review): "stop" runs after "ascii_folding" and
+                    # "portuguese_stemmer", so tokens are already folded and
+                    # stemmed when compared with the _portuguese_ stop list
+                    # -- confirm this order is intended.
+                    "portuguese": {
+                        "type": "custom",
+                        "tokenizer": "standard",
+                        "filter": [
+                            "lowercase",
+                            "ascii_folding",
+                            "portuguese_stemmer",
+                            "stop",
+                            "portuguese_synonyms",
+                        ],
+                    },
+                    # Shingle-producing variant; not referenced by any field
+                    # declared in this document.
+                    "portuguese_search": {
+                        "type": "custom",
+                        "tokenizer": "standard",
+                        "filter": [
+                            "lowercase",
+                            "ascii_folding",
+                            "portuguese_stemmer",
+                            "stop",
+                            "suggest_shingle",
+                        ],
+                    },
+                },
+                "filter": {
+                    "suggest_shingle": {
+                        "type": "shingle",
+                        "min_shingle_size": 2,
+                        "max_shingle_size": 3,
+                    },
+                    "stop": {"type": "stop", "stopwords": "_portuguese_"},
+                    "ascii_folding": {"type": "asciifolding"},
+                    "portuguese_stemmer": {"type": "stemmer", "language": "portuguese"},
+                    "portuguese_synonyms": {
+                        "type": "synonym",
+                        "synonyms_path": "synonyms.txt",
+                        "expand": True,
+                    },
+                },
+            },
+        }
+
+    class Django:
+        model = DiarioOficial
+        fields = [
+            "data",
+            "numero",
+            "link",
+        ]
+
+    def prepare_tipo(self, instance):
+        """Return the name of the related type, or ``None`` when unset."""
+        return instance.tipo.nome if instance.tipo else None
+
+    def prepare_title(self, instance):
+        """Return "<type name> <numero>", using 'Diário' when there is no type."""
+        return f"{instance.tipo.nome if instance.tipo else 'Diário'} {instance.numero}"
+
+    def prepare_pages(self, instance):
+        """Return the list of page objects for the nested ``pages`` field.
+
+        ``page_content`` may hold either a list of page dicts or a
+        JSON-encoded string of that list (rows written with ``json.dumps``).
+        A string is decoded so the nested field always receives a list of
+        objects; empty, undecodable or non-list values yield ``[]``.
+        """
+        import json  # local import: this module may not import json at the top
+
+        pages = instance.page_content
+        if isinstance(pages, str):
+            try:
+                pages = json.loads(pages)
+            except ValueError:  # json.JSONDecodeError is a ValueError subclass
+                return []
+        return pages if isinstance(pages, list) else []
+
diff --git a/diarios/migrations/0004_remove_diariooficial_finalizado_and_more.py b/diarios/migrations/0004_remove_diariooficial_finalizado_and_more.py
new file mode 100644
index 0000000..5feaa8f
--- /dev/null
+++ b/diarios/migrations/0004_remove_diariooficial_finalizado_and_more.py
@@ -0,0 +1,39 @@
+# Generated by Django 5.0.12 on 2025-03-07 15:25
+
+import django.core.serializers.json
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ # Schema changes for the "diariooficial" model: drops "finalizado", adds
+ # the nullable JSON column "page_content" and alters the "tipo" foreign key.
+
+ # Must run after migration 0003 of the "diarios" app.
+ dependencies = [
+ ("diarios", "0003_tipodiariooficial_diariooficial_and_more"),
+ ]
+
+ operations = [
+ # Remove the "finalizado" column from the table.
+ migrations.RemoveField(
+ model_name="diariooficial",
+ name="finalizado",
+ ),
+ # New optional JSON column, serialized with DjangoJSONEncoder.
+ migrations.AddField(
+ model_name="diariooficial",
+ name="page_content",
+ field=models.JSONField(
+ blank=True,
+ encoder=django.core.serializers.json.DjangoJSONEncoder,
+ null=True,
+ ),
+ ),
+ # "tipo" stays optional; on_delete is SET_NULL, so deleting the related
+ # type clears the reference rather than deleting the row.
+ migrations.AlterField(
+ model_name="diariooficial",
+ name="tipo",
+ field=models.ForeignKey(
+ blank=True,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="diarios",
+ to="diarios.tipodiariooficial",
+ ),
+ ),
+ ]
diff --git a/diarios/models.py b/diarios/models.py
index 50427fc..3afc642 100644
--- a/diarios/models.py
+++ b/diarios/models.py
@@ -1,6 +1,7 @@
from django.db import models
import PyPDF2
import json
+from django.core.serializers.json import DjangoJSONEncoder
class PDFDocument(models.Model):
@@ -50,13 +51,30 @@ class DiarioOficial(models.Model):
TipoDiarioOficial,
blank=True,
null=True,
- on_delete=models.CASCADE,
+ on_delete=models.SET_NULL,
related_name="diarios",
)
numero = models.CharField(max_length=20, unique=True)
link = models.URLField(blank=True, null=True, unique=True)
- finalizado = models.BooleanField(default=False)
+ page_content = models.JSONField(encoder=DjangoJSONEncoder, blank=True, null=True)
+
+ def save(self, *args, **kwargs):
+     """Extract per-page text from the attached PDF, then save the row.
+
+     When ``self.file`` is set, the PDF is parsed with PyPDF2 and
+     ``page_content`` becomes a list of ``{"number", "content"}`` dicts,
+     with page numbers starting at 1. All positional and keyword
+     arguments are passed through to the parent ``save``.
+     """
+     # NOTE(review): the PDF is re-parsed on every save while a file is set.
+     if self.file:
+         pdf = PyPDF2.PdfReader(self.file)
+         # Assign the Python list itself: page_content is a JSONField and is
+         # serialized on write. Assigning json.dumps(...) stored a JSON
+         # *string*, so readers got text back instead of a list of pages.
+         self.page_content = [
+             {"number": number, "content": pagina.extract_text()}
+             for number, pagina in enumerate(pdf.pages, start=1)
+         ]
+
+     super().save(*args, **kwargs)
+
@property
def data_formatada(self):
return format_date(self.data, format="long", locale="pt_BR")
@@ -71,3 +89,4 @@ class DiarioOficial(models.Model):
class Meta:
constraints = [models.UniqueConstraint(fields=["numero"], name="unique_numero")]
verbose_name_plural = "Diários Oficiais"
+
diff --git a/diarios/views.py b/diarios/views.py
index a43babc..edd3176 100644
--- a/diarios/views.py
+++ b/diarios/views.py
@@ -39,14 +39,8 @@ def spellcheck_view(request):
return JsonResponse({"suggestions": suggestions})
def search_view(request):
- query = request.GET.get("q", "") # Obtém o termo de pesquisa da URL
- page = request.GET.get("page", 1) # Obtém o valor de "page" (padrão: 1)
-
- # Converte page para int
- try:
- page = int(page)
- except ValueError:
- page = 1 # Valor padrão em caso de erro
+ query = request.GET.get('q', '')  # Search term taken from the URL
+ # "page" comes straight from the query string, so it may be non-numeric
+ # (?page=abc) or empty; fall back to the first page instead of letting
+ # int() raise and fail the whole request.
+ try:
+     page = int(request.GET.get('page', 1))
+ except (TypeError, ValueError):
+     page = 1
+ # Values below 1 would give a negative offset in the pagination slice.
+ page = max(page, 1)
results = []
suggestions = []
@@ -61,13 +55,13 @@ def search_view(request):
# Remove os termos entre aspas da consulta principal
cleaned_query = query
for phrase in exact_phrases:
- cleaned_query = cleaned_query.replace(f'"{phrase}"', "")
+ cleaned_query = cleaned_query.replace(f'"{phrase}"', '')
# Remove espaços extras e pontuação desnecessária
- cleaned_query = re.sub(r"\s+", " ", cleaned_query).strip()
+ cleaned_query = re.sub(r'\s+', ' ', cleaned_query).strip()
# Cria uma consulta no Elasticsearch
- search = Search(index="pdf_documents")
+ search = Search(index='diarios_oficiais')
# Lista para armazenar todas as consultas
queries = []
@@ -75,65 +69,42 @@ def search_view(request):
# Adiciona consulta para termos gerais (com fuzziness para tolerância a erros)
if cleaned_query:
queries.append(
- Q(
- "multi_match",
- query=cleaned_query,
- fields=["title^3", "content^2", "synonyms^1"],
- fuzziness="AUTO",
- boost=2,
- )
+ Q('multi_match',
+ query=cleaned_query,
+ fields=['title^3', 'pages.content^2'],
+ fuzziness='AUTO',
+ boost=2)
)
- queries.append(Q("match", synonyms={"query": cleaned_query, "boost": 0.5}))
# Adiciona consultas exatas para frases entre aspas (sem fuzziness)
for phrase in exact_phrases:
if phrase.strip():
- # Consulta de frase exata para o título com peso alto
queries.append(
- Q(
- "match_phrase",
- title={
- "query": phrase,
- "boost": 3,
- "slop": 0, # Sem flexibilidade na ordem das palavras
- },
- )
- )
-
- # Consulta de frase exata para o conteúdo com peso médio
- queries.append(
- Q(
- "match_phrase",
- content={
- "query": phrase,
- "boost": 2,
- "slop": 0, # Sem flexibilidade na ordem das palavras
- },
- )
+ Q('match_phrase',
+ pages__content={
+ 'query': phrase,
+ 'boost': 2,
+ 'slop': 0 # Sem flexibilidade na ordem das palavras
+ })
)
# Combina as consultas com OR (se houver alguma)
if queries:
- search = search.query(Q("bool", should=queries, minimum_should_match=1))
+ search = search.query(
+ Q('bool', should=queries, minimum_should_match=1)
+ )
# Configuração do highlight para mostrar mais contexto
search = search.highlight(
- "content",
+ 'pages.content',
fragment_size=300,
number_of_fragments=2,
- pre_tags=[""],
- post_tags=[""],
- )
- search = search.highlight(
- "title",
- fragment_size=300,
- number_of_fragments=1,
- pre_tags=[""],
- post_tags=[""],
+ pre_tags=[''],
+ post_tags=['']
)
# Paginação
- search = search[(page - 1) * per_page : page * per_page]
+ search = search[(page-1)*per_page:page*per_page]
# Executa a consulta
response = search.execute()
@@ -141,168 +112,48 @@ def search_view(request):
# Processa os resultados
for hit in response:
- # Obter o objeto PDFDocument correspondente
- try:
- pdf_doc = PDFDocument.objects.get(id=hit.meta.id)
- pdf_url = pdf_doc.file.url # URL do PDF
-
- matching_pages = []
- if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'pages.content'):
- for highlight in hit.meta.highlight['pages.content']:
- page_matches = re.findall(r'page_(\d+)', highlight)
- if page_matches:
- matching_pages.append(int(page_matches[0]))
- if not matching_pages and query:
- if pdf_doc.page_content:
- try:
- page_data = json.loads(pdf_doc.page_content)
- for page_d in page_data:
- if query.lower() in page_d['content'].lower():
- matching_pages.append(page_d['number'])
- except json.JSONDecodeError as e:
- logger.error(f"Erro ao decodificar JSON para o documento {pdf_doc.id}: {e}")
- page_data = []
- else:
- page_data = []
-
- matching_pages = sorted(list(set(matching_pages)))
- except PDFDocument.DoesNotExist:
- pdf_url = ""
- matching_pages = []
-
# Extrai o conteúdo destacado ou usa o original
- if hasattr(hit.meta, "highlight") and hasattr(
- hit.meta.highlight, "content"
- ):
- highlighted_content = " ... ".join(hit.meta.highlight.content)
+ if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'pages.content'):
+ highlighted_content = ' ... '.join(hit.meta.highlight['pages.content'])
else:
- # Se não houver highlight, pegue os primeiros 300 caracteres
- highlighted_content = (
- hit.content[:300] + "..."
- if len(hit.content) > 300
- else hit.content
- )
+ highlighted_content = ""
# Extrai o título destacado ou usa o original
- if hasattr(hit.meta, "highlight") and hasattr(
- hit.meta.highlight, "title"
- ):
+ if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'title'):
highlighted_title = hit.meta.highlight.title[0]
else:
highlighted_title = hit.title
# Verifica se o resultado corresponde a uma frase exata
- is_exact_match = any(
- phrase.lower() in hit.content.lower()
- or phrase.lower() in hit.title.lower()
- for phrase in exact_phrases
- )
+ # Check whether the hit contains one of the quoted phrases verbatim.
+ # "pages" is a nested field, i.e. a list of {number, content} objects, so
+ # the text is read page by page (the list itself has no ".content").
+ hit_title = (getattr(hit, 'title', '') or '').lower()
+ hit_page_texts = [
+     (getattr(hit_page, 'content', '') or '').lower()
+     for hit_page in (getattr(hit, 'pages', None) or [])
+ ]
+ is_exact_match = any(
+     phrase.lower() in hit_title
+     or any(phrase.lower() in text for text in hit_page_texts)
+     for phrase in exact_phrases
+ )
- results.append(
- {
- "id": hit.meta.id,
- "title": hit.title,
- "highlighted_title": highlighted_title,
- "highlighted_content": highlighted_content,
- "uploaded_at": hit.uploaded_at,
- "score": hit.meta.score,
- "is_exact_match": is_exact_match,
- "pdf_url": pdf_url,
- "matching_pages": matching_pages,
- }
- )
-
- # Sugestões "Você quis dizer" (apenas para termos fora de aspas)
- if total_hits < 5 and cleaned_query:
- suggestion_search = Search(index="pdf_documents")
- suggestion_search = suggestion_search.suggest(
- "term_suggestion",
- cleaned_query,
- term={"field": "content", "suggest_mode": "popular", "size": 5},
- )
- suggestion_response = suggestion_search.execute()
-
- if hasattr(suggestion_response, "suggest") and hasattr(
- suggestion_response.suggest, "term_suggestion"
- ):
- for suggestion in suggestion_response.suggest.term_suggestion:
- for option in suggestion.options:
- suggestions.append(option.text)
-
- # Cria uma correção ortográfica se necessário
- if suggestions and total_hits == 0:
- corrected_query = cleaned_query
- for (
- suggestion_term
- ) in suggestion_response.suggest.term_suggestion:
- if suggestion_term.options:
- # Substitui palavras incorretas por sugestões
- word_to_replace = suggestion_term.text
- corrected_word = suggestion_term.options[0].text
- corrected_query = re.sub(
- r"\b" + re.escape(word_to_replace) + r"\b",
- corrected_word,
- corrected_query,
- flags=re.IGNORECASE,
- )
-
- # Reconstrói a consulta original mantendo as frases entre aspas
- if corrected_query != cleaned_query:
- spelling_correction = corrected_query
- for phrase in exact_phrases:
- spelling_correction += f' "{phrase}"'
- spelling_correction = spelling_correction.strip()
-
- # Busca por termos relacionados (apenas se houver poucos resultados)
- if total_hits < 3 and cleaned_query:
- related_terms = Search(index="pdf_documents")
- related_terms = related_terms.query(
- "more_like_this",
- fields=["content", "title"],
- like=cleaned_query,
- min_term_freq=1,
- max_query_terms=10,
- min_doc_freq=1,
- )
- related_terms = related_terms[:5]
- related_response = related_terms.execute()
-
- for hit in related_response:
- # Verifica se este documento já está nos resultados
- if not any(r.get("id") == hit.meta.id for r in results):
- results.append(
- {
- "id": hit.meta.id,
- "title": hit.title,
- "highlighted_title": hit.title,
- "highlighted_content": (
- hit.content[:300] + "..."
- if len(hit.content) > 300
- else hit.content
- ),
- "uploaded_at": hit.uploaded_at,
- "score": hit.meta.score,
- "is_related": True,
- "pdf_url": pdf_url,
- }
- )
+ # "finalizado" is no longer exposed: the field is removed from
+ # DiarioOficial and is not part of the indexed document, so reading
+ # hit.finalizado raised AttributeError for every result.
+ results.append({
+     'id': hit.meta.id,
+     'title': hit.title,
+     'highlighted_title': highlighted_title,
+     'highlighted_content': highlighted_content,
+     'data': hit.data,
+     'numero': hit.numero,
+     'link': hit.link,
+     'is_exact_match': is_exact_match,
+ })
# Calcula a paginação
total_pages = (total_hits + per_page - 1) // per_page if total_hits > 0 else 0
# Renderiza o template com os resultados
- return render(
- request,
- "diarios/search_results.html",
- {
- "query": query,
- "results": results,
- "suggestions": suggestions[:5], # Limita a 5 sugestões
- "spelling_correction": spelling_correction,
- "total_hits": total_hits,
- "page": page,
- "total_pages": total_pages,
- "page_range": range(max(1, page - 2), min(total_pages + 1, page + 3)),
- "has_exact_phrases": bool(exact_phrases),
- },
- )
+ return render(request, 'diarios/search_results.html', {
+ 'query': query,
+ 'results': results,
+ 'suggestions': suggestions[:5], # Limita a 5 sugestões
+ 'spelling_correction': spelling_correction,
+ 'total_hits': total_hits,
+ 'page': page,
+ 'total_pages': total_pages,
+ 'page_range': range(max(1, page-2), min(total_pages+1, page+3)),
+ 'has_exact_phrases': bool(exact_phrases)
+ })
+