adiciona o buscador e cria views e templates para ele

2025-03-07 15:31:53 +01:00
parent 1cd93f7955
commit 3f5ac79051
18 changed files with 378 additions and 235 deletions
--- a/diarios/views.py
+++ b/diarios/views.py
@@ -1,9 +1,11 @@
+import json
+import debugpy
 from django.shortcuts import render
 from elasticsearch_dsl import Search, Q
 from elasticsearch_dsl.connections import connections
 from django.conf import settings
 import re
-
+from .documents import PDFDocument
 from django.http import JsonResponse


@@ -12,224 +14,295 @@ connections.create_connection(hosts=[settings.ELASTICSEARCH_HOSTS])


 def spellcheck_view(request):
-    query = request.GET.get('q', '')
+    query = request.GET.get("q", "")
    suggestions = []
-    
+
    if query:
-        s = Search(index='pdf_documents')
-        s = s.suggest('auto_correct', query, 
-                     phrase={
-                         'field': 'suggest',
-                         'size': 3,
-                         'gram_size': 3,
-                         'confidence': 2.0,
-                         'direct_generator': [{
-                             'field': 'suggest',
-                             'suggest_mode': 'popular'
-                         }]
-                     })
+        s = Search(index="pdf_documents")
+        s = s.suggest(
+            "auto_correct",
+            query,
+            phrase={
+                "field": "suggest",
+                "size": 3,
+                "gram_size": 3,
+                "confidence": 2.0,
+                "direct_generator": [{"field": "suggest", "suggest_mode": "popular"}],
+            },
+        )
        response = s.execute()
-        
-        if hasattr(response.suggest, 'auto_correct'):
+
+        if hasattr(response.suggest, "auto_correct"):
            for option in response.suggest.auto_correct[0].options:
                suggestions.append(option.text)
-    
-    return JsonResponse({'suggestions': suggestions})
+
+    return JsonResponse({"suggestions": suggestions})

 def search_view(request):
-    query = request.GET.get('q', '')  # Obtém o termo de pesquisa da URL
-    page = int(request.GET.get('page', 1))
-    
+    query = request.GET.get("q", "")  # Obtém o termo de pesquisa da URL
+    page = request.GET.get("page", 1)  # Obtém o valor de "page" (padrão: 1)
+
+    # Converte page para int
+    try:
+        page = int(page)
+    except ValueError:
+        page = 1  # Valor padrão em caso de erro
+
    results = []
    suggestions = []
    spelling_correction = None
    total_hits = 0
    per_page = 10
-    
+
    if query:
        # Processamento especial para termos entre aspas
        exact_phrases = re.findall(r'"([^"]*)"', query)
-        
+
        # Remove os termos entre aspas da consulta principal
        cleaned_query = query
        for phrase in exact_phrases:
-            cleaned_query = cleaned_query.replace(f'"{phrase}"', '')
-        
+            cleaned_query = cleaned_query.replace(f'"{phrase}"', "")
+
        # Remove espaços extras e pontuação desnecessária
-        cleaned_query = re.sub(r'\s+', ' ', cleaned_query).strip()
-        
+        cleaned_query = re.sub(r"\s+", " ", cleaned_query).strip()
+
        # Cria uma consulta no Elasticsearch
-        search = Search(index='pdf_documents')
-        
+        search = Search(index="pdf_documents")
+
        # Lista para armazenar todas as consultas
        queries = []
-        
+
        # Adiciona consulta para termos gerais (com fuzziness para tolerância a erros)
        if cleaned_query:
            queries.append(
-                Q('multi_match',
-                  query=cleaned_query,
-                  fields=['title^3', 'content^2', 'synonyms^1'],
-                  fuzziness='AUTO',
-                  boost=2,)
-                  )
-            queries.append(
-                Q('match',
-                  synonyms={
-                      'query': cleaned_query,
-                      'boost': 0.5
-                  })
-                  )
-                
+                Q(
+                    "multi_match",
+                    query=cleaned_query,
+                    fields=["title^3", "content^2", "synonyms^1"],
+                    fuzziness="AUTO",
+                    boost=2,
+                )
+            )
+            queries.append(Q("match", synonyms={"query": cleaned_query, "boost": 0.5}))
+
        # Adiciona consultas exatas para frases entre aspas (sem fuzziness)
        for phrase in exact_phrases:
            if phrase.strip():
                # Consulta de frase exata para o título com peso alto
                queries.append(
-                    Q('match_phrase',
-                      title={
-                          'query': phrase,
-                          'boost': 3,
-                          'slop': 0  # Sem flexibilidade na ordem das palavras
-                      })
+                    Q(
+                        "match_phrase",
+                        title={
+                            "query": phrase,
+                            "boost": 3,
+                            "slop": 0,  # Sem flexibilidade na ordem das palavras
+                        },
+                    )
                )
-                
+
                # Consulta de frase exata para o conteúdo com peso médio
                queries.append(
-                    Q('match_phrase',
-                      content={
-                          'query': phrase,
-                          'boost': 2,
-                          'slop': 0  # Sem flexibilidade na ordem das palavras
-                      })
+                    Q(
+                        "match_phrase",
+                        content={
+                            "query": phrase,
+                            "boost": 2,
+                            "slop": 0,  # Sem flexibilidade na ordem das palavras
+                        },
+                    )
                )
-        
+
        # Combina as consultas com OR (se houver alguma)
        if queries:
-            search = search.query(
-                Q('bool', should=queries, minimum_should_match=1)
-            )
-            
+            search = search.query(Q("bool", should=queries, minimum_should_match=1))
+
            # Configuração do highlight para mostrar mais contexto
-            search = search.highlight('content', fragment_size=300, number_of_fragments=2, pre_tags=['<mark>'], post_tags=['</mark>'])
-            search = search.highlight('title', fragment_size=300, number_of_fragments=1, pre_tags=['<mark>'], post_tags=['</mark>'])
-            
+            search = search.highlight(
+                "content",
+                fragment_size=300,
+                number_of_fragments=2,
+                pre_tags=["<mark>"],
+                post_tags=["</mark>"],
+            )
+            search = search.highlight(
+                "title",
+                fragment_size=300,
+                number_of_fragments=1,
+                pre_tags=["<mark>"],
+                post_tags=["</mark>"],
+            )
+
            # Paginação
-            search = search[(page-1)*per_page:page*per_page]
-            
+            search = search[(page - 1) * per_page : page * per_page]
+
            # Executa a consulta
            response = search.execute()
            total_hits = response.hits.total.value
-            
+
            # Processa os resultados
            for hit in response:
+                # Obter o objeto PDFDocument correspondente
+                try:
+                    pdf_doc = PDFDocument.objects.get(id=hit.meta.id)
+                    pdf_url = pdf_doc.file.url  # URL do PDF
+
+                    matching_pages = []
+                    if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'pages.content'):
+                        for highlight in hit.meta.highlight['pages.content']:
+                            page_matches = re.findall(r'page_(\d+)', highlight)
+                            if page_matches:
+                                matching_pages.append(int(page_matches[0]))
+                    if not matching_pages and query:
+                        if pdf_doc.page_content:
+                            try:
+                                page_data = json.loads(pdf_doc.page_content)
+                                for page_d in page_data:
+                                    if query.lower() in page_d['content'].lower():
+                                        matching_pages.append(page_d['number'])
+                            except json.JSONDecodeError as e:
+                                logger.error(f"Erro ao decodificar JSON para o documento {pdf_doc.id}: {e}")
+                                page_data = []
+                        else:
+                            page_data = []
+                    
+                    matching_pages = sorted(list(set(matching_pages)))
+                except PDFDocument.DoesNotExist:
+                    pdf_url = ""
+                    matching_pages = []
+
                # Extrai o conteúdo destacado ou usa o original
-                if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'content'):
-                    highlighted_content = ' ... '.join(hit.meta.highlight.content)
+                if hasattr(hit.meta, "highlight") and hasattr(
+                    hit.meta.highlight, "content"
+                ):
+                    highlighted_content = " ... ".join(hit.meta.highlight.content)
                else:
                    # Se não houver highlight, pegue os primeiros 300 caracteres
-                    highlighted_content = hit.content[:300] + '...' if len(hit.content) > 300 else hit.content
-                    
+                    highlighted_content = (
+                        hit.content[:300] + "..."
+                        if len(hit.content) > 300
+                        else hit.content
+                    )
+
                # Extrai o título destacado ou usa o original
-                if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'title'):
+                if hasattr(hit.meta, "highlight") and hasattr(
+                    hit.meta.highlight, "title"
+                ):
                    highlighted_title = hit.meta.highlight.title[0]
                else:
                    highlighted_title = hit.title
-                    
+
                # Verifica se o resultado corresponde a uma frase exata
-                is_exact_match = any(phrase.lower() in hit.content.lower() or 
-                                    phrase.lower() in hit.title.lower() 
-                                    for phrase in exact_phrases)
-                
-                results.append({
-                    'id': hit.meta.id,
-                    'title': hit.title,
-                    'highlighted_title': highlighted_title,
-                    'highlighted_content': highlighted_content,
-                    'uploaded_at': hit.uploaded_at,
-                    'score': hit.meta.score,
-                    'is_exact_match': is_exact_match
-                })
-            
-            # Sugestões "Você quis dizer" (apenas para termos fora de aspas)
-            if total_hits < 5 and cleaned_query:
-                suggestion_search = Search(index='pdf_documents')
-                suggestion_search = suggestion_search.suggest(
-                    'term_suggestion',
-                    cleaned_query,
-                    term={
-                        'field': 'content',
-                        'suggest_mode': 'popular',
-                        'size': 5
+                is_exact_match = any(
+                    phrase.lower() in hit.content.lower()
+                    or phrase.lower() in hit.title.lower()
+                    for phrase in exact_phrases
+                )
+
+                results.append(
+                    {
+                        "id": hit.meta.id,
+                        "title": hit.title,
+                        "highlighted_title": highlighted_title,
+                        "highlighted_content": highlighted_content,
+                        "uploaded_at": hit.uploaded_at,
+                        "score": hit.meta.score,
+                        "is_exact_match": is_exact_match,
+                        "pdf_url": pdf_url,
+                        "matching_pages": matching_pages,
                    }
                )
+
+            # Sugestões "Você quis dizer" (apenas para termos fora de aspas)
+            if total_hits < 5 and cleaned_query:
+                suggestion_search = Search(index="pdf_documents")
+                suggestion_search = suggestion_search.suggest(
+                    "term_suggestion",
+                    cleaned_query,
+                    term={"field": "content", "suggest_mode": "popular", "size": 5},
+                )
                suggestion_response = suggestion_search.execute()
-                
-                if hasattr(suggestion_response, 'suggest') and hasattr(suggestion_response.suggest, 'term_suggestion'):
+
+                if hasattr(suggestion_response, "suggest") and hasattr(
+                    suggestion_response.suggest, "term_suggestion"
+                ):
                    for suggestion in suggestion_response.suggest.term_suggestion:
                        for option in suggestion.options:
                            suggestions.append(option.text)
-                    
+
                    # Cria uma correção ortográfica se necessário
                    if suggestions and total_hits == 0:
                        corrected_query = cleaned_query
-                        for suggestion_term in suggestion_response.suggest.term_suggestion:
+                        for (
+                            suggestion_term
+                        ) in suggestion_response.suggest.term_suggestion:
                            if suggestion_term.options:
                                # Substitui palavras incorretas por sugestões
                                word_to_replace = suggestion_term.text
                                corrected_word = suggestion_term.options[0].text
-                                corrected_query = re.sub(r'\b' + re.escape(word_to_replace) + r'\b', 
-                                                       corrected_word, 
-                                                       corrected_query, 
-                                                       flags=re.IGNORECASE)
-                        
+                                corrected_query = re.sub(
+                                    r"\b" + re.escape(word_to_replace) + r"\b",
+                                    corrected_word,
+                                    corrected_query,
+                                    flags=re.IGNORECASE,
+                                )
+
                        # Reconstrói a consulta original mantendo as frases entre aspas
                        if corrected_query != cleaned_query:
                            spelling_correction = corrected_query
                            for phrase in exact_phrases:
                                spelling_correction += f' "{phrase}"'
                            spelling_correction = spelling_correction.strip()
-                
+
            # Busca por termos relacionados (apenas se houver poucos resultados)
            if total_hits < 3 and cleaned_query:
-                related_terms = Search(index='pdf_documents')
+                related_terms = Search(index="pdf_documents")
                related_terms = related_terms.query(
-                    'more_like_this',
-                    fields=['content', 'title'],
+                    "more_like_this",
+                    fields=["content", "title"],
                    like=cleaned_query,
                    min_term_freq=1,
                    max_query_terms=10,
-                    min_doc_freq=1
+                    min_doc_freq=1,
                )
                related_terms = related_terms[:5]
                related_response = related_terms.execute()
-                
+
                for hit in related_response:
                    # Verifica se este documento já está nos resultados
-                    if not any(r.get('id') == hit.meta.id for r in results):
-                        results.append({
-                            'id': hit.meta.id,
-                            'title': hit.title,
-                            'highlighted_title': hit.title,
-                            'highlighted_content': hit.content[:300] + '...' if len(hit.content) > 300 else hit.content,
-                            'uploaded_at': hit.uploaded_at,
-                            'score': hit.meta.score,
-                            'is_related': True
-                        })
-    
+                    if not any(r.get("id") == hit.meta.id for r in results):
+                        results.append(
+                            {
+                                "id": hit.meta.id,
+                                "title": hit.title,
+                                "highlighted_title": hit.title,
+                                "highlighted_content": (
+                                    hit.content[:300] + "..."
+                                    if len(hit.content) > 300
+                                    else hit.content
+                                ),
+                                "uploaded_at": hit.uploaded_at,
+                                "score": hit.meta.score,
+                                "is_related": True,
+                                "pdf_url": pdf_url,
+                            }
+                        )
+
    # Calcula a paginação
    total_pages = (total_hits + per_page - 1) // per_page if total_hits > 0 else 0
-    
-    # Renderiza o template com os resultados
-    return render(request, 'diarios/search_results.html', {
-        'query': query,
-        'results': results,
-        'suggestions': suggestions[:5],  # Limita a 5 sugestões
-        'spelling_correction': spelling_correction,
-        'total_hits': total_hits,
-        'page': page,
-        'total_pages': total_pages,
-        'page_range': range(max(1, page-2), min(total_pages+1, page+3)),
-        'has_exact_phrases': bool(exact_phrases)
-    })

+    # Renderiza o template com os resultados
+    return render(
+        request,
+        "diarios/search_results.html",
+        {
+            "query": query,
+            "results": results,
+            "suggestions": suggestions[:5],  # Limita a 5 sugestões
+            "spelling_correction": spelling_correction,
+            "total_hits": total_hits,
+            "page": page,
+            "total_pages": total_pages,
+            "page_range": range(max(1, page - 2), min(total_pages + 1, page + 3)),
+            "has_exact_phrases": bool(exact_phrases),
+        },
+    )