modifica a view de pesquisa, o modelo de diário oficial e o documento do diário oficial

2025-03-07 16:32:10 +01:00
parent 6471ee6152
commit 8d1f6feeaf
4 changed files with 203 additions and 204 deletions
--- a/diarios/documents.py
+++ b/diarios/documents.py
@ -65,3 +65,93 @@ class PDFDocumentDocument(Document):
    class Django:
        model = PDFDocument
        fields = ["uploaded_at", "file"]
+
+from django_elasticsearch_dsl import Document, fields
+from django_elasticsearch_dsl.registries import registry
+from .models import DiarioOficial
+
+@registry.register_document
+class DiarioOficialDocument(Document):
+    """Elasticsearch document mapping for ``DiarioOficial`` rows.
+
+    Documents go to the ``diarios_oficiais`` index and carry a computed
+    title, the type name, the file URL and one nested entry per page of
+    extracted text.
+    """
+
+    # Main fields; their values come from prepare_title / prepare_tipo below.
+    title = fields.TextField()
+    tipo = fields.KeywordField()
+
+    # URL of the PDF file (if applicable).
+    # NOTE(review): assumes the model exposes an ``arquivo`` file field that
+    # is populated when indexing happens -- confirm against the model.
+    arquivo = fields.TextField(attr="arquivo.url")
+
+    # One nested object per page, sourced from the model's page_content.
+    pages = fields.NestedField(
+        properties={
+            "number": fields.IntegerField(),
+            "content": fields.TextField(analyzer="portuguese"),
+        }
+    )
+
+    class Index:
+        name = "diarios_oficiais"
+        settings = {
+            "number_of_shards": 1,
+            "number_of_replicas": 0,
+            "analysis": {
+                "analyzer": {
+                    # Filter order matters: stopwords are removed while the
+                    # tokens still carry their accents and full form, then
+                    # stemmed, and only then folded to ASCII. Running the
+                    # stop filter after folding/stemming lets words such as
+                    # "não" or "também" slip past the _portuguese_ list.
+                    "portuguese": {
+                        "type": "custom",
+                        "tokenizer": "standard",
+                        "filter": [
+                            "lowercase",
+                            "stop",
+                            "portuguese_stemmer",
+                            "ascii_folding",
+                            "portuguese_synonyms",
+                        ],
+                    },
+                    # Same base chain as "portuguese" so that query-side
+                    # tokens line up with the indexed ones; ends with
+                    # shingles instead of synonyms.
+                    "portuguese_search": {
+                        "type": "custom",
+                        "tokenizer": "standard",
+                        "filter": [
+                            "lowercase",
+                            "stop",
+                            "portuguese_stemmer",
+                            "ascii_folding",
+                            "suggest_shingle",
+                        ],
+                    },
+                },
+                "filter": {
+                    "suggest_shingle": {
+                        "type": "shingle",
+                        "min_shingle_size": 2,
+                        "max_shingle_size": 3,
+                    },
+                    "stop": {"type": "stop", "stopwords": "_portuguese_"},
+                    "ascii_folding": {"type": "asciifolding"},
+                    "portuguese_stemmer": {"type": "stemmer", "language": "portuguese"},
+                    "portuguese_synonyms": {
+                        "type": "synonym",
+                        "synonyms_path": "synonyms.txt",
+                        "expand": True,
+                    },
+                },
+            },
+        }
+
+    class Django:
+        model = DiarioOficial
+        fields = [
+            "data",
+            "numero",
+            "link",
+        ]
+
+    def prepare_tipo(self, instance):
+        """Return the related type's ``nome``, or None when no type is set."""
+        return instance.tipo.nome if instance.tipo else None
+
+    def prepare_title(self, instance):
+        """Return "<type name> <numero>", using 'Diário' when no type is set."""
+        return f"{instance.tipo.nome if instance.tipo else 'Diário'} {instance.numero}"
+
+    def prepare_pages(self, instance):
+        """Return the per-page entries to index in the nested ``pages`` field.
+
+        ``page_content`` is expected to be a list of
+        ``{"number": ..., "content": ...}`` dicts, but it may also hold that
+        list serialized as a JSON string; the string form is decoded so the
+        nested mapping always receives a list.
+
+        Raises:
+            json.JSONDecodeError: if ``page_content`` is a string that is
+                not valid JSON.
+        """
+        import json
+
+        pages = instance.page_content
+        if not pages:
+            return []
+        if isinstance(pages, str):
+            # A JSON-encoded string cannot be indexed as nested objects.
+            pages = json.loads(pages)
+        return pages
+
--- a/diarios/migrations/0004_remove_diariooficial_finalizado_and_more.py
+++ b/diarios/migrations/0004_remove_diariooficial_finalizado_and_more.py
@ -0,0 +1,39 @@
+# Generated by Django 5.0.12 on 2025-03-07 15:25
+
+import django.core.serializers.json
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Schema changes for the "diariooficial" model: drops "finalizado",
+    # adds "page_content" and redefines the "tipo" foreign key.
+
+    # Must run after migration 0003 of the "diarios" app.
+    dependencies = [
+        ("diarios", "0003_tipodiariooficial_diariooficial_and_more"),
+    ]
+
+    operations = [
+        # Drop the "finalizado" column.
+        migrations.RemoveField(
+            model_name="diariooficial",
+            name="finalizado",
+        ),
+        # Add an optional (blank/null) JSON column serialized with
+        # DjangoJSONEncoder.
+        migrations.AddField(
+            model_name="diariooficial",
+            name="page_content",
+            field=models.JSONField(
+                blank=True,
+                encoder=django.core.serializers.json.DjangoJSONEncoder,
+                null=True,
+            ),
+        ),
+        # "tipo" becomes a nullable FK to TipoDiarioOficial with
+        # on_delete=SET_NULL, reachable from the type as "diarios".
+        migrations.AlterField(
+            model_name="diariooficial",
+            name="tipo",
+            field=models.ForeignKey(
+                blank=True,
+                null=True,
+                on_delete=django.db.models.deletion.SET_NULL,
+                related_name="diarios",
+                to="diarios.tipodiariooficial",
+            ),
+        ),
+    ]
--- a/diarios/models.py
+++ b/diarios/models.py
@ -1,6 +1,7 @@
 from django.db import models
 import PyPDF2
 import json
+from django.core.serializers.json import DjangoJSONEncoder


 class PDFDocument(models.Model):
@ -50,12 +51,29 @@ class DiarioOficial(models.Model):
        TipoDiarioOficial,
        blank=True,
        null=True,
-        on_delete=models.CASCADE,
+        on_delete=models.SET_NULL,
        related_name="diarios",
    )
    numero = models.CharField(max_length=20, unique=True)
    link = models.URLField(blank=True, null=True, unique=True)
-    finalizado = models.BooleanField(default=False)
+    page_content = models.JSONField(encoder=DjangoJSONEncoder, blank=True, null=True)
+    
+    def save(self, *args, **kwargs):
+        """Extract per-page text from the attached PDF, then persist the row.
+
+        When a file is attached, ``page_content`` is rebuilt as a list of
+        ``{"number": <1-based page index>, "content": <extracted text>}``
+        dicts before delegating to the default ``save``. Positional and
+        keyword arguments are forwarded unchanged.
+        """
+        # NOTE(review): the search document maps this model's file through
+        # ``arquivo.url``; confirm that ``file`` is the right attribute here.
+        if self.file:
+            pdf = PyPDF2.PdfReader(self.file)
+            # Assign the list itself: the JSONField does the serialization,
+            # so wrapping it in json.dumps() would store a JSON *string* and
+            # the value would read back as str instead of a list of dicts.
+            self.page_content = [
+                {
+                    "number": number,
+                    "content": pagina.extract_text() or "",
+                }
+                for number, pagina in enumerate(pdf.pages, start=1)
+            ]
+
+        super().save(*args, **kwargs)
    
    @property
    def data_formatada(self):
@ -71,3 +89,4 @@ class DiarioOficial(models.Model):
    class Meta:
        constraints = [models.UniqueConstraint(fields=["numero"], name="unique_numero")]
        verbose_name_plural = "Diários Oficiais"
+
--- a/diarios/views.py
+++ b/diarios/views.py
@ -39,14 +39,8 @@ def spellcheck_view(request):
    return JsonResponse({"suggestions": suggestions})

 def search_view(request):
-    query = request.GET.get("q", "")  # Obtém o termo de pesquisa da URL
-    page = request.GET.get("page", 1)  # Obtém o valor de "page" (padrão: 1)
-
-    # Converte page para int
-    try:
-        page = int(page)
-    except ValueError:
-        page = 1  # Valor padrão em caso de erro
+    query = request.GET.get('q', '')  # Obtém o termo de pesquisa da URL
+    page = int(request.GET.get('page', 1))

    results = []
    suggestions = []
@ -61,13 +55,13 @@ def search_view(request):
        # Remove os termos entre aspas da consulta principal
        cleaned_query = query
        for phrase in exact_phrases:
-            cleaned_query = cleaned_query.replace(f'"{phrase}"', "")
+            cleaned_query = cleaned_query.replace(f'"{phrase}"', '')

        # Remove espaços extras e pontuação desnecessária
-        cleaned_query = re.sub(r"\s+", " ", cleaned_query).strip()
+        cleaned_query = re.sub(r'\s+', ' ', cleaned_query).strip()

        # Cria uma consulta no Elasticsearch
-        search = Search(index="pdf_documents")
+        search = Search(index='diarios_oficiais')

        # Lista para armazenar todas as consultas
        queries = []
@ -75,65 +69,42 @@ def search_view(request):
        # Adiciona consulta para termos gerais (com fuzziness para tolerância a erros)
        if cleaned_query:
            queries.append(
-                Q(
-                    "multi_match",
-                    query=cleaned_query,
-                    fields=["title^3", "content^2", "synonyms^1"],
-                    fuzziness="AUTO",
-                    boost=2,
-                )
+                Q('multi_match',
+                  query=cleaned_query,
+                  fields=['title^3', 'pages.content^2'],
+                  fuzziness='AUTO',
+                  boost=2)
            )
-            queries.append(Q("match", synonyms={"query": cleaned_query, "boost": 0.5}))

        # Adiciona consultas exatas para frases entre aspas (sem fuzziness)
        for phrase in exact_phrases:
            if phrase.strip():
-                # Consulta de frase exata para o título com peso alto
                queries.append(
-                    Q(
-                        "match_phrase",
-                        title={
-                            "query": phrase,
-                            "boost": 3,
-                            "slop": 0,  # Sem flexibilidade na ordem das palavras
-                        },
-                    )
-                )
-
-                # Consulta de frase exata para o conteúdo com peso médio
-                queries.append(
-                    Q(
-                        "match_phrase",
-                        content={
-                            "query": phrase,
-                            "boost": 2,
-                            "slop": 0,  # Sem flexibilidade na ordem das palavras
-                        },
-                    )
+                    Q('match_phrase',
+                      pages__content={
+                          'query': phrase,
+                          'boost': 2,
+                          'slop': 0  # Sem flexibilidade na ordem das palavras
+                      })
                )

        # Combina as consultas com OR (se houver alguma)
        if queries:
-            search = search.query(Q("bool", should=queries, minimum_should_match=1))
+            search = search.query(
+                Q('bool', should=queries, minimum_should_match=1)
+            )

            # Configuração do highlight para mostrar mais contexto
            search = search.highlight(
-                "content",
+                'pages.content',
                fragment_size=300,
                number_of_fragments=2,
-                pre_tags=["<mark>"],
-                post_tags=["</mark>"],
-            )
-            search = search.highlight(
-                "title",
-                fragment_size=300,
-                number_of_fragments=1,
-                pre_tags=["<mark>"],
-                post_tags=["</mark>"],
+                pre_tags=['<mark>'],
+                post_tags=['</mark>']
            )

            # Paginação
-            search = search[(page - 1) * per_page : page * per_page]
+            search = search[(page-1)*per_page:page*per_page]

            # Executa a consulta
            response = search.execute()
@ -141,168 +112,48 @@ def search_view(request):

            # Processa os resultados
            for hit in response:
-                # Obter o objeto PDFDocument correspondente
-                try:
-                    pdf_doc = PDFDocument.objects.get(id=hit.meta.id)
-                    pdf_url = pdf_doc.file.url  # URL do PDF
-
-                    matching_pages = []
-                    if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'pages.content'):
-                        for highlight in hit.meta.highlight['pages.content']:
-                            page_matches = re.findall(r'page_(\d+)', highlight)
-                            if page_matches:
-                                matching_pages.append(int(page_matches[0]))
-                    if not matching_pages and query:
-                        if pdf_doc.page_content:
-                            try:
-                                page_data = json.loads(pdf_doc.page_content)
-                                for page_d in page_data:
-                                    if query.lower() in page_d['content'].lower():
-                                        matching_pages.append(page_d['number'])
-                            except json.JSONDecodeError as e:
-                                logger.error(f"Erro ao decodificar JSON para o documento {pdf_doc.id}: {e}")
-                                page_data = []
-                        else:
-                            page_data = []
-                    
-                    matching_pages = sorted(list(set(matching_pages)))
-                except PDFDocument.DoesNotExist:
-                    pdf_url = ""
-                    matching_pages = []
-
                # Extrai o conteúdo destacado ou usa o original
-                if hasattr(hit.meta, "highlight") and hasattr(
-                    hit.meta.highlight, "content"
-                ):
-                    highlighted_content = " ... ".join(hit.meta.highlight.content)
+                if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'pages.content'):
+                    highlighted_content = ' ... '.join(hit.meta.highlight['pages.content'])
                else:
-                    # Se não houver highlight, pegue os primeiros 300 caracteres
-                    highlighted_content = (
-                        hit.content[:300] + "..."
-                        if len(hit.content) > 300
-                        else hit.content
-                    )
+                    highlighted_content = ""

                # Extrai o título destacado ou usa o original
-                if hasattr(hit.meta, "highlight") and hasattr(
-                    hit.meta.highlight, "title"
-                ):
+                if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'title'):
                    highlighted_title = hit.meta.highlight.title[0]
                else:
                    highlighted_title = hit.title

                # Verifica se o resultado corresponde a uma frase exata
-                is_exact_match = any(
-                    phrase.lower() in hit.content.lower()
-                    or phrase.lower() in hit.title.lower()
-                    for phrase in exact_phrases
-                )
+                is_exact_match = any(phrase.lower() in hit.pages.content.lower() or
+                                    phrase.lower() in hit.title.lower()
+                                    for phrase in exact_phrases)

-                results.append(
-                    {
-                        "id": hit.meta.id,
-                        "title": hit.title,
-                        "highlighted_title": highlighted_title,
-                        "highlighted_content": highlighted_content,
-                        "uploaded_at": hit.uploaded_at,
-                        "score": hit.meta.score,
-                        "is_exact_match": is_exact_match,
-                        "pdf_url": pdf_url,
-                        "matching_pages": matching_pages,
-                    }
-                )
-
-            # Sugestões "Você quis dizer" (apenas para termos fora de aspas)
-            if total_hits < 5 and cleaned_query:
-                suggestion_search = Search(index="pdf_documents")
-                suggestion_search = suggestion_search.suggest(
-                    "term_suggestion",
-                    cleaned_query,
-                    term={"field": "content", "suggest_mode": "popular", "size": 5},
-                )
-                suggestion_response = suggestion_search.execute()
-
-                if hasattr(suggestion_response, "suggest") and hasattr(
-                    suggestion_response.suggest, "term_suggestion"
-                ):
-                    for suggestion in suggestion_response.suggest.term_suggestion:
-                        for option in suggestion.options:
-                            suggestions.append(option.text)
-
-                    # Cria uma correção ortográfica se necessário
-                    if suggestions and total_hits == 0:
-                        corrected_query = cleaned_query
-                        for (
-                            suggestion_term
-                        ) in suggestion_response.suggest.term_suggestion:
-                            if suggestion_term.options:
-                                # Substitui palavras incorretas por sugestões
-                                word_to_replace = suggestion_term.text
-                                corrected_word = suggestion_term.options[0].text
-                                corrected_query = re.sub(
-                                    r"\b" + re.escape(word_to_replace) + r"\b",
-                                    corrected_word,
-                                    corrected_query,
-                                    flags=re.IGNORECASE,
-                                )
-
-                        # Reconstrói a consulta original mantendo as frases entre aspas
-                        if corrected_query != cleaned_query:
-                            spelling_correction = corrected_query
-                            for phrase in exact_phrases:
-                                spelling_correction += f' "{phrase}"'
-                            spelling_correction = spelling_correction.strip()
-
-            # Busca por termos relacionados (apenas se houver poucos resultados)
-            if total_hits < 3 and cleaned_query:
-                related_terms = Search(index="pdf_documents")
-                related_terms = related_terms.query(
-                    "more_like_this",
-                    fields=["content", "title"],
-                    like=cleaned_query,
-                    min_term_freq=1,
-                    max_query_terms=10,
-                    min_doc_freq=1,
-                )
-                related_terms = related_terms[:5]
-                related_response = related_terms.execute()
-
-                for hit in related_response:
-                    # Verifica se este documento já está nos resultados
-                    if not any(r.get("id") == hit.meta.id for r in results):
-                        results.append(
-                            {
-                                "id": hit.meta.id,
-                                "title": hit.title,
-                                "highlighted_title": hit.title,
-                                "highlighted_content": (
-                                    hit.content[:300] + "..."
-                                    if len(hit.content) > 300
-                                    else hit.content
-                                ),
-                                "uploaded_at": hit.uploaded_at,
-                                "score": hit.meta.score,
-                                "is_related": True,
-                                "pdf_url": pdf_url,
-                            }
-                        )
+                results.append({
+                    'id': hit.meta.id,
+                    'title': hit.title,
+                    'highlighted_title': highlighted_title,
+                    'highlighted_content': highlighted_content,
+                    'data': hit.data,
+                    'numero': hit.numero,
+                    'link': hit.link,
+                    'finalizado': hit.finalizado,
+                    'is_exact_match': is_exact_match
+                })

    # Calcula a paginação
    total_pages = (total_hits + per_page - 1) // per_page if total_hits > 0 else 0

    # Renderiza o template com os resultados
-    return render(
-        request,
-        "diarios/search_results.html",
-        {
-            "query": query,
-            "results": results,
-            "suggestions": suggestions[:5],  # Limita a 5 sugestões
-            "spelling_correction": spelling_correction,
-            "total_hits": total_hits,
-            "page": page,
-            "total_pages": total_pages,
-            "page_range": range(max(1, page - 2), min(total_pages + 1, page + 3)),
-            "has_exact_phrases": bool(exact_phrases),
-        },
-    )
+    return render(request, 'diarios/search_results.html', {
+        'query': query,
+        'results': results,
+        'suggestions': suggestions[:5],  # Limita a 5 sugestões
+        'spelling_correction': spelling_correction,
+        'total_hits': total_hits,
+        'page': page,
+        'total_pages': total_pages,
+        'page_range': range(max(1, page-2), min(total_pages+1, page+3)),
+        'has_exact_phrases': bool(exact_phrases)
+    })
+