arruma o processo de busca textual nos diarios

2025-03-14 17:36:14 +01:00
parent 8d1f6feeaf
commit f2e5cd73b7
15 changed files with 650 additions and 645 deletions
--- a/diarios/views.py
+++ b/diarios/views.py
@ -1,159 +1,83 @@
-import json
-import debugpy
 from django.shortcuts import render
-from elasticsearch_dsl import Search, Q
-from elasticsearch_dsl.connections import connections
-from django.conf import settings
-import re
-from .documents import PDFDocument
-from django.http import JsonResponse
+from elasticsearch_dsl import Q
+from .documents import DiarioOficialDocument

-
-# Configuração da conexão com o Elasticsearch
-connections.create_connection(hosts=[settings.ELASTICSEARCH_HOSTS])
-
-
-def spellcheck_view(request):
-    query = request.GET.get("q", "")
-    suggestions = []
-
-    if query:
-        s = Search(index="pdf_documents")
-        s = s.suggest(
-            "auto_correct",
-            query,
-            phrase={
-                "field": "suggest",
-                "size": 3,
-                "gram_size": 3,
-                "confidence": 2.0,
-                "direct_generator": [{"field": "suggest", "suggest_mode": "popular"}],
-            },
-        )
-        response = s.execute()
-
-        if hasattr(response.suggest, "auto_correct"):
-            for option in response.suggest.auto_correct[0].options:
-                suggestions.append(option.text)
-
-    return JsonResponse({"suggestions": suggestions})
-
-def search_view(request):
-    query = request.GET.get('q', '')  # Obtém o termo de pesquisa da URL
+# Upper bound for the ``size`` query parameter so a single request cannot
+# ask for an arbitrarily large result window.
+_MAX_PAGE_SIZE = 100
+
+
+def _positive_int(raw, default, maximum=None):
+    """Parse a query-string value as an integer greater than or equal to 1.
+
+    Returns ``default`` when ``raw`` is missing or is not an integer.
+    Values below 1 are raised to 1; when ``maximum`` is given, larger
+    values are lowered to it.
+    """
+    try:
+        value = int(raw)
+    except (TypeError, ValueError):
+        return default
+    value = max(value, 1)
+    return value if maximum is None else min(value, maximum)
+
+
+def search_diarios(request):
+    """Render a paginated full-text search over ``DiarioOficialDocument``.
+
+    Query-string parameters:
+        q: search text; when empty, no search is executed and the
+            template receives an empty result list.
+        page: 1-based page number (default 1).
+        size: results per page (default 10, capped at ``_MAX_PAGE_SIZE``).
+
+    Malformed or non-positive ``page``/``size`` values are replaced by safe
+    ones instead of raising, so the slice passed to the search is never
+    negative and ``total_pages`` never divides by zero.
+    """
+    q = request.GET.get('q', '')
-    page = int(request.GET.get('page', 1))
-
+    page = _positive_int(request.GET.get('page'), default=1)
+    size = _positive_int(
+        request.GET.get('size'), default=10, maximum=_MAX_PAGE_SIZE
+    )
+
+    start = (page - 1) * size
+    end = start + size
+
     results = []
-    suggestions = []
-    spelling_correction = None
-    total_hits = 0
-    per_page = 10
-
-    if query:
-        # Processamento especial para termos entre aspas
-        exact_phrases = re.findall(r'"([^"]*)"', query)
-
-        # Remove os termos entre aspas da consulta principal
-        cleaned_query = query
-        for phrase in exact_phrases:
-            cleaned_query = cleaned_query.replace(f'"{phrase}"', '')
-
-        # Remove espaços extras e pontuação desnecessária
-        cleaned_query = re.sub(r'\s+', ' ', cleaned_query).strip()
-
-        # Cria uma consulta no Elasticsearch
-        search = Search(index='diarios_oficiais')
-
-        # Lista para armazenar todas as consultas
-        queries = []
-
-        # Adiciona consulta para termos gerais (com fuzziness para tolerância a erros)
-        if cleaned_query:
-            queries.append(
-                Q('multi_match',
-                  query=cleaned_query,
-                  fields=['title^3', 'pages.content^2'],
-                  fuzziness='AUTO',
-                  boost=2)
-            )
-
-        # Adiciona consultas exatas para frases entre aspas (sem fuzziness)
-        for phrase in exact_phrases:
-            if phrase.strip():
-                queries.append(
-                    Q('match_phrase',
-                      pages__content={
-                          'query': phrase,
-                          'boost': 2,
-                          'slop': 0  # Sem flexibilidade na ordem das palavras
-                      })
-                )
-
-        # Combina as consultas com OR (se houver alguma)
-        if queries:
-            search = search.query(
-                Q('bool', should=queries, minimum_should_match=1)
-            )
-
-            # Configuração do highlight para mostrar mais contexto
-            search = search.highlight(
-                'pages.content',
-                fragment_size=300,
-                number_of_fragments=2,
-                pre_tags=['<mark>'],
-                post_tags=['</mark>']
-            )
-
-            # Paginação
-            search = search[(page-1)*per_page:page*per_page]
-
-            # Executa a consulta
-            response = search.execute()
-            total_hits = response.hits.total.value
-
-            # Processa os resultados
-            for hit in response:
-                # Extrai o conteúdo destacado ou usa o original
-                if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'pages.content'):
-                    highlighted_content = ' ... '.join(hit.meta.highlight['pages.content'])
-                else:
-                    highlighted_content = ""
-
-                # Extrai o título destacado ou usa o original
-                if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'title'):
-                    highlighted_title = hit.meta.highlight.title[0]
-                else:
-                    highlighted_title = hit.title
-
-                # Verifica se o resultado corresponde a uma frase exata
-                is_exact_match = any(phrase.lower() in hit.pages.content.lower() or
-                                    phrase.lower() in hit.title.lower()
-                                    for phrase in exact_phrases)
-
-                results.append({
-                    'id': hit.meta.id,
-                    'title': hit.title,
-                    'highlighted_title': highlighted_title,
-                    'highlighted_content': highlighted_content,
-                    'data': hit.data,
-                    'numero': hit.numero,
-                    'link': hit.link,
-                    'finalizado': hit.finalizado,
-                    'is_exact_match': is_exact_match
-                })
-
-    # Calcula a paginação
-    total_pages = (total_hits + per_page - 1) // per_page if total_hits > 0 else 0
-
-    # Renderiza o template com os resultados
-    return render(request, 'diarios/search_results.html', {
-        'query': query,
+    total = 0
+
+    if q:
+        # Main query: field boosts rank matches in ``content`` and
+        # ``tipo.nome`` above the other fields; fuzziness is left to
+        # Elasticsearch's 'AUTO' setting.
+        query = Q(
+            'multi_match',
+            query=q,
+            fields=['content^3', 'tipo.nome^2', 'numero', 'pages.content'],
+            fuzziness='AUTO',
+        )
+
+        # Both highlighted fields share the same fragment options, so a
+        # single highlight() call configures them together.
+        search = (
+            DiarioOficialDocument.search()
+            .query(query)
+            .highlight(
+                'content',
+                'pages.content',
+                fragment_size=150,
+                number_of_fragments=3,
+            )
+        )
+
+        # Pagination: start/end are non-negative because page and size
+        # were normalised above.
+        response = search[start:end].execute()
+        total = response.hits.total.value
+
+        for hit in response:
+            # The highlight section is only read when the hit carries one.
+            highlights = getattr(hit.meta, 'highlight', None)
+
+            highlight = ''
+            highlighted_pages = []
+            if highlights is not None:
+                if 'content' in highlights:
+                    highlight = '...'.join(highlights['content'])
+                if 'pages.content' in highlights:
+                    # NOTE(review): 'number' is the fragment's 1-based
+                    # position in the highlight list, not a resolved page
+                    # number; mapping fragments back to pages is still
+                    # to be done.
+                    highlighted_pages = [
+                        {'number': position, 'content': fragment}
+                        for position, fragment in enumerate(
+                            highlights['pages.content'], start=1
+                        )
+                    ]
+
+            # Merge the document fields with their highlights.
+            results.append({
+                'id': hit.id,
+                'tipo': hit.tipo.nome if hasattr(hit, 'tipo') and hit.tipo else '',
+                'numero': hit.numero,
+                'data': hit.data,
+                'link': hit.link,
+                'highlight': highlight,
+                'highlighted_pages': highlighted_pages,
+            })
+
+    context = {
+        'query': q,
         'results': results,
-        'suggestions': suggestions[:5],  # Limita a 5 sugestões
-        'spelling_correction': spelling_correction,
-        'total_hits': total_hits,
+        'total': total,
         'page': page,
-        'total_pages': total_pages,
-        'page_range': range(max(1, page-2), min(total_pages+1, page+3)),
-        'has_exact_phrases': bool(exact_phrases)
-    })
+        'size': size,
+        # Ceiling division; size >= 1 is guaranteed, and total == 0 yields 0.
+        'total_pages': (total + size - 1) // size,
+    }
+
+    return render(request, 'diarios/diarios_search.html', context)
+
+def diario_detail(request, pk):
+    """Render the detail page for the ``Diario`` whose primary key is ``pk``.
+
+    Responds with HTTP 404 when no matching ``Diario`` exists.
+    """
+    # Neither name is imported at module level in this file; without these
+    # imports the view raises NameError on its first request.
+    from django.shortcuts import get_object_or_404
+
+    # NOTE(review): assumes the Diario model is defined in this app's
+    # ``models`` module -- confirm the import path.
+    from .models import Diario
+
+    diario = get_object_or_404(Diario, pk=pk)
+    return render(request, 'diarios/diario_detail.html', {'diario': diario})