# =============================================================================
# Post-patch view of the code touched by this change set.
#
# The original text was a unified diff whose line breaks had been collapsed.
# Each section below is labelled with the file it belongs to. Lines the diff
# did not show are marked "not shown in the diff" rather than reconstructed.
# =============================================================================


# -----------------------------------------------------------------------------
# diarios/documents.py  (appended after the existing PDFDocumentDocument class)
# -----------------------------------------------------------------------------
from django_elasticsearch_dsl import Document, fields
from django_elasticsearch_dsl.registries import registry

from .models import DiarioOficial


@registry.register_document
class DiarioOficialDocument(Document):
    """Elasticsearch mapping for ``DiarioOficial``.

    Besides the model fields listed in ``Django.fields`` the document indexes
    a computed ``title``, the name of the related ``tipo``, the URL of the
    attached file and one nested entry per PDF page taken from
    ``DiarioOficial.page_content``.
    """

    # Main fields (both are filled by the prepare_* methods below).
    title = fields.TextField()
    tipo = fields.KeywordField()

    # URL of the uploaded file. FileField (instead of TextField with
    # attr="arquivo.url") yields an empty string when no file is attached,
    # where a bare ``.url`` lookup raises ValueError.
    # NOTE(review): DiarioOficial.save() reads ``self.file`` while this field
    # reads ``arquivo`` -- confirm which attribute the model really declares.
    arquivo = fields.FileField()

    # One nested object per page so a match can be attributed to a page.
    pages = fields.NestedField(
        properties={
            "number": fields.IntegerField(),
            "content": fields.TextField(analyzer="portuguese"),
        }
    )

    class Index:
        name = "diarios_oficiais"
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "analysis": {
                "analyzer": {
                    # Filter order matters: stopwords are removed while the
                    # tokens still carry their accents (the _portuguese_ list
                    # contains accented words), stemming also runs on the
                    # accented form, and ASCII folding comes afterwards.
                    "portuguese": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": [
                            "lowercase",
                            "stop",
                            "portuguese_stemmer",
                            "ascii_folding",
                            "portuguese_synonyms",
                        ],
                    },
                    # NOTE(review): no field in this document references
                    # this analyzer -- confirm whether it is still needed.
                    "portuguese_search": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": [
                            "lowercase",
                            "stop",
                            "portuguese_stemmer",
                            "ascii_folding",
                            "suggest_shingle",
                        ],
                    },
                },
                "filter": {
                    "suggest_shingle": {
                        "type": "shingle",
                        "min_shingle_size": 2,
                        "max_shingle_size": 3,
                    },
                    "stop": {"type": "stop", "stopwords": "_portuguese_"},
                    "ascii_folding": {"type": "asciifolding"},
                    "portuguese_stemmer": {
                        "type": "stemmer",
                        "language": "portuguese",
                    },
                    "portuguese_synonyms": {
                        "type": "synonym",
                        "synonyms_path": "synonyms.txt",
                        "expand": True,
                    },
                },
            },
        }

    class Django:
        model = DiarioOficial
        fields = [
            "data",
            "numero",
            "link",
        ]

    def prepare_tipo(self, instance):
        """Return the related type's ``nome``, or ``None`` without a type."""
        return instance.tipo.nome if instance.tipo else None

    def prepare_title(self, instance):
        """Build the title as "<type name> <numero>" ("Diário" without a type)."""
        return f"{instance.tipo.nome if instance.tipo else 'Diário'} {instance.numero}"

    def prepare_pages(self, instance):
        """Return the per-page entries for the nested ``pages`` field.

        ``page_content`` is expected to be a list of
        ``{"number": ..., "content": ...}`` dicts. A string value is decoded
        first, because an earlier ``save()`` stored the list JSON-encoded.
        """
        content = instance.page_content
        if not content:
            return []
        if isinstance(content, str):
            import json  # local import: only legacy rows need decoding

            content = json.loads(content)
        return content


# -----------------------------------------------------------------------------
# diarios/migrations/0004_remove_diariooficial_finalizado_and_more.py (new file)
# -----------------------------------------------------------------------------
# Generated by Django 5.0.12 on 2025-03-07 15:25

import django.core.serializers.json
import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    """Drop ``finalizado``, add ``page_content``, make ``tipo`` SET_NULL."""

    dependencies = [
        ("diarios", "0003_tipodiariooficial_diariooficial_and_more"),
    ]

    operations = [
        migrations.RemoveField(
            model_name="diariooficial",
            name="finalizado",
        ),
        migrations.AddField(
            model_name="diariooficial",
            name="page_content",
            field=models.JSONField(
                blank=True,
                encoder=django.core.serializers.json.DjangoJSONEncoder,
                null=True,
            ),
        ),
        migrations.AlterField(
            model_name="diariooficial",
            name="tipo",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name="diarios",
                to="diarios.tipodiariooficial",
            ),
        ),
    ]


# -----------------------------------------------------------------------------
# diarios/models.py  (only the regions covered by the diff hunks)
# -----------------------------------------------------------------------------
from django.db import models
import PyPDF2
import json
from django.core.serializers.json import DjangoJSONEncoder

# class PDFDocument(models.Model): ...   (body not shown in the diff)


class DiarioOficial(models.Model):
    # NOTE(review): the declarations that precede ``tipo`` are not shown in
    # the diff. The ``tipo =`` line itself is also outside the hunk context;
    # its name is taken from the AlterField operation in migration 0004.
    tipo = models.ForeignKey(
        TipoDiarioOficial,
        blank=True,
        null=True,
        on_delete=models.SET_NULL,
        related_name="diarios",
    )
    numero = models.CharField(max_length=20, unique=True)
    link = models.URLField(blank=True, null=True, unique=True)
    page_content = models.JSONField(encoder=DjangoJSONEncoder, blank=True, null=True)

    def save(self, *args, **kwargs):
        """Extract the text of every PDF page into ``page_content``, then save.

        When a file is attached, ``page_content`` becomes a list with one
        ``{"number": <1-based page>, "content": <text>}`` dict per page. The
        list is assigned as-is: the JSONField serialises it itself, so
        passing ``json.dumps(...)`` would store a JSON *string* instead of a
        list and break consumers that iterate over the pages.
        """
        if self.file:
            reader = PyPDF2.PdfReader(self.file)
            self.page_content = [
                # extract_text() may yield None for pages without text
                {"number": number, "content": page.extract_text() or ""}
                for number, page in enumerate(reader.pages, start=1)
            ]

        super().save(*args, **kwargs)

    @property
    def data_formatada(self):
        return format_date(self.data, format="long", locale="pt_BR")

    # ... (models.py lines between this property and Meta are not shown in the diff)

    class Meta:
        constraints = [models.UniqueConstraint(fields=["numero"], name="unique_numero")]
        verbose_name_plural = "Diários Oficiais"


# -----------------------------------------------------------------------------
# diarios/views.py  (only the regions covered by the diff hunks)
# -----------------------------------------------------------------------------
# The first hunk opens with the tail of spellcheck_view
# (``return JsonResponse({"suggestions": suggestions})``), which is unchanged.


def _parse_page(raw_value):
    """Return ``raw_value`` as a page number >= 1, or 1 when it is not an int."""
    try:
        page = int(raw_value)
    except (TypeError, ValueError):
        return 1
    # Zero or negative pages would turn into a negative slice further down.
    return max(page, 1)


def _page_fragments(hit):
    """Collect the highlighted page fragments from the hit's nested inner hits."""
    inner_hits = getattr(hit.meta, 'inner_hits', None)
    if inner_hits is None or 'pages' not in inner_hits:
        return []
    fragments = []
    for page_hit in inner_hits['pages']:
        highlight = getattr(page_hit.meta, 'highlight', None)
        if highlight is not None and 'pages.content' in highlight:
            fragments.extend(highlight['pages.content'])
    return fragments


def _is_exact_match(hit, exact_phrases):
    """Tell whether any quoted phrase occurs in the hit's title or page text."""
    if not exact_phrases:
        return False
    title_text = (getattr(hit, 'title', None) or '').lower()
    # ``pages`` is a list of nested objects, so every page is checked.
    page_texts = [
        (getattr(page, 'content', None) or '').lower()
        for page in (getattr(hit, 'pages', None) or [])
    ]
    for phrase in exact_phrases:
        needle = phrase.lower()
        if needle in title_text or any(needle in text for text in page_texts):
            return True
    return False


def search_view(request):
    """Search the ``diarios_oficiais`` index and render the results page.

    Reads ``q`` (search text; quoted parts are treated as exact phrases) and
    ``page`` (1-based page number) from the query string.
    """
    query = request.GET.get('q', '')  # search term from the URL
    page = _parse_page(request.GET.get('page', 1))

    results = []
    suggestions = []

    # NOTE(review): views.py lines 47-54 are not shown in the diff. The code
    # below uses ``per_page``, ``spelling_correction``, ``total_hits`` and
    # ``exact_phrases``, so they are presumably defined there. Any enclosing
    # ``if`` block is not visible either -- confirm against the full file.

    # Strip the quoted phrases from the main query
    cleaned_query = query
    for phrase in exact_phrases:
        cleaned_query = cleaned_query.replace(f'"{phrase}"', '')

    # Collapse leftover whitespace
    cleaned_query = re.sub(r'\s+', ' ', cleaned_query).strip()

    search = Search(index='diarios_oficiais')

    queries = []        # top-level clauses
    page_queries = []   # clauses evaluated against each nested page

    # General terms (fuzzy, to tolerate typos). The boosts keep the former
    # weights: field weight (title^3, pages.content^2) times the old
    # multi_match boost of 2.
    if cleaned_query:
        queries.append(
            Q('match',
              title={'query': cleaned_query, 'fuzziness': 'AUTO', 'boost': 6})
        )
        page_queries.append(
            Q('match',
              pages__content={'query': cleaned_query, 'fuzziness': 'AUTO', 'boost': 4})
        )

    # Exact phrases between quotes (no fuzziness)
    for phrase in exact_phrases:
        if phrase.strip():
            page_queries.append(
                Q('match_phrase',
                  pages__content={
                      'query': phrase,
                      'boost': 2,
                      'slop': 0,  # no flexibility in word order
                  })
            )

    # ``pages`` is mapped as a nested field, so its sub-fields only match
    # inside a ``nested`` query; highlighting is requested through
    # inner_hits for the same reason.
    if page_queries:
        queries.append(
            Q('nested',
              path='pages',
              query=Q('bool', should=page_queries, minimum_should_match=1),
              inner_hits={
                  'highlight': {
                      'fields': {
                          'pages.content': {
                              'fragment_size': 300,
                              'number_of_fragments': 2,
                          },
                      },
                      # NOTE(review): the tags are empty strings in the
                      # source -- confirm that is intended.
                      'pre_tags': [''],
                      'post_tags': [''],
                  },
              })
        )

    # Combine everything with OR (when there is anything to combine)
    if queries:
        search = search.query(
            Q('bool', should=queries, minimum_should_match=1)
        )

    # The loop below reads the title highlight, so it has to be requested.
    search = search.highlight(
        'title',
        fragment_size=300,
        number_of_fragments=1,
        pre_tags=[''],
        post_tags=['']
    )

    # Pagination
    search = search[(page - 1) * per_page:page * per_page]

    response = search.execute()

    # NOTE(review): views.py line 111 is not shown in the diff; ``total_hits``
    # is presumably taken from ``response`` there -- confirm.

    for hit in response:
        # Highlighted page content ("" when nothing was highlighted)
        highlighted_content = ' ... '.join(_page_fragments(hit))

        # Highlighted title, falling back to the plain title
        if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'title'):
            highlighted_title = hit.meta.highlight.title[0]
        else:
            highlighted_title = hit.title

        # ``finalizado`` is no longer reported: migration 0004 removes the
        # field and the document does not index it.
        results.append({
            'id': hit.meta.id,
            'title': hit.title,
            'highlighted_title': highlighted_title,
            'highlighted_content': highlighted_content,
            'data': hit.data,
            'numero': hit.numero,
            'link': hit.link,
            'is_exact_match': _is_exact_match(hit, exact_phrases),
        })

    # Number of result pages
    total_pages = (total_hits + per_page - 1) // per_page if total_hits > 0 else 0

    return render(request, 'diarios/search_results.html', {
        'query': query,
        'results': results,
        'suggestions': suggestions[:5],  # at most 5 suggestions
        'spelling_correction': spelling_correction,
        'total_hits': total_hits,
        'page': page,
        'total_pages': total_pages,
        'page_range': range(max(1, page - 2), min(total_pages + 1, page + 3)),
        'has_exact_phrases': bool(exact_phrases),
    })