diff --git a/compose/local/django/start b/compose/local/django/start index ba96db4..bc81e2a 100644 --- a/compose/local/django/start +++ b/compose/local/django/start @@ -6,4 +6,4 @@ set -o nounset python manage.py migrate -exec python manage.py runserver_plus 0.0.0.0:8000 +exec python manage.py runserver_plus 0.0.0.0:8005 diff --git a/config/settings/base.py b/config/settings/base.py index 8c283c6..68af005 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -82,7 +82,8 @@ THIRD_PARTY_APPS = [ LOCAL_APPS = [ "diários_oficiais_alems.users", - # Your stuff: custom apps go here + "diarios", + 'django_elasticsearch_dsl', ] # https://docs.djangoproject.com/en/dev/ref/settings/#installed-apps INSTALLED_APPS = DJANGO_APPS + THIRD_PARTY_APPS + LOCAL_APPS @@ -287,7 +288,8 @@ STATICFILES_FINDERS += ["compressor.finders.CompressorFinder"] # Elastic Search # ------------------------------------------------------------------------------ ELASTICSEARCH_DSL = { - 'default': { - 'hosts': env('ELASTICSEARCH_HOSTS', default='localhost:9200') - }, -} \ No newline at end of file + 'default': { + 'hosts': 'http://elasticsearch:9200' # same as above + }, +} +ELASTICSEARCH_HOSTS="http://elasticsearch:9200" diff --git a/config/settings/local.py b/config/settings/local.py index 28700a8..1787e95 100644 --- a/config/settings/local.py +++ b/config/settings/local.py @@ -14,7 +14,7 @@ SECRET_KEY = env( default="tYdYl0MP5zgpMlMmjBuYHvH4Dp3JDN5q3sxWBdFejemZSr0qpI9IrvrvTm17F0aW", ) # https://docs.djangoproject.com/en/dev/ref/settings/#allowed-hosts -ALLOWED_HOSTS = ["localhost", "0.0.0.0", "127.0.0.1"] # noqa: S104 +ALLOWED_HOSTS = ["localhost", "0.0.0.0", "127.0.0.1", "109.199.98.226"] # noqa: S104 # CACHES # ------------------------------------------------------------------------------ diff --git a/config/urls.py b/config/urls.py index b7adf62..c0786ff 100644 --- a/config/urls.py +++ b/config/urls.py @@ -20,7 +20,7 @@ urlpatterns = [ path("users/", 
@registry.register_document
class PDFDocumentDocument(Document):
    """Elasticsearch mapping for ``PDFDocument`` rows.

    ``title`` is indexed with the default analyzer, ``content`` with the
    custom Portuguese analyzer declared in ``Index.settings`` and
    ``uploaded_at`` is copied straight from the model.

    NOTE(review): the search views in this change also query fields named
    ``synonyms`` and ``suggest``; neither is mapped here - confirm whether
    they should be added to this document.
    """

    title = fields.TextField()
    content = fields.TextField(analyzer='portuguese')

    class Index:
        name = 'pdf_documents'
        settings = {
            'number_of_shards': 1,
            'number_of_replicas': 0,
            'analysis': {
                'analyzer': {
                    # Index-time analyzer used by ``content``.
                    # Filter order matters: stop words must be removed while
                    # the tokens still carry their accents and full endings,
                    # otherwise entries of the ``_portuguese_`` list such as
                    # "não" or "também" no longer match.  Folding therefore
                    # runs after stop-word removal and stemming.
                    'portuguese': {
                        'type': 'custom',
                        'tokenizer': 'standard',
                        'filter': [
                            'lowercase',
                            'stop',
                            'portuguese_stemmer',
                            'ascii_folding',
                            'portuguese_synonyms',
                        ],
                    },
                    # Same chain, but emitting 2-3 word shingles instead of
                    # synonyms.  No field in this document references it yet.
                    'portuguese_search': {
                        'type': 'custom',
                        'tokenizer': 'standard',
                        'filter': [
                            'lowercase',
                            'stop',
                            'portuguese_stemmer',
                            'ascii_folding',
                            'suggest_shingle',
                        ],
                    },
                },
                'filter': {
                    'suggest_shingle': {
                        'type': 'shingle',
                        'min_shingle_size': 2,
                        'max_shingle_size': 3,
                    },
                    'stop': {
                        'type': 'stop',
                        'stopwords': '_portuguese_',
                    },
                    'ascii_folding': {
                        'type': 'asciifolding',
                    },
                    'portuguese_stemmer': {
                        'type': 'stemmer',
                        'language': 'portuguese',
                    },
                    # NOTE(review): ``synonyms_path`` is resolved by the
                    # Elasticsearch node, relative to its config directory -
                    # confirm the file is made available to the container.
                    'portuguese_synonyms': {
                        'type': 'synonym',
                        'synonyms_path': 'synonyms.txt',
                        'expand': True,
                    },
                },
            },
        }

    class Django:
        model = PDFDocument
        fields = ['uploaded_at']
@receiver(post_save, sender=PDFDocument)
def extract_text(sender, instance, created, **kwargs):
    """Populate ``content`` from the uploaded PDF right after creation.

    ``PDFDocument.save()`` already extracts the text before writing the row,
    so this handler only acts as a fallback for newly created documents whose
    ``content`` is still empty.  Skipping the populated case avoids parsing
    the same PDF again and re-entering ``save()`` for nothing.

    Args:
        sender: The model class that sent the signal (``PDFDocument``).
        instance: The ``PDFDocument`` that was just saved.
        created: ``True`` when the row was inserted rather than updated.
        **kwargs: Remaining ``post_save`` arguments (unused).
    """
    if not created or not instance.file or instance.content:
        return

    reader = PyPDF2.PdfReader(instance.file)
    # Defensive: treat a page that yields no text (None or '') as empty so
    # ``join`` never receives a non-string.
    page_texts = [(page.extract_text() or '') for page in reader.pages]
    text = '\n'.join(page_texts)
    if not text:
        # Nothing extractable; ``content`` is already empty, so a second
        # save would only trigger another round of signals.
        return

    instance.content = text
    instance.save(update_fields=['content'])
+ +
+
+ + + + + +
+
+
+ +
+
+ + +
+
+ Use aspas duplas para buscar frases exatas, ex: "documento oficial" +
+
+
+
+
+ +
+ {% if query %} + +
+ {% if total_hits > 0 %} +

Cerca de {{ total_hits }} resultados encontrados para "{{ query }}"

+ {% else %} +

Nenhum resultado encontrado para "{{ query }}"

+ {% endif %} +
+ + + {% if spelling_correction %} +
+

Você quis dizer: {{ spelling_correction }}?

+
+ {% endif %} + + + {% if suggestions %} +
+

Talvez você esteja procurando por: + {% for suggestion in suggestions %} + {{ suggestion }}{% if not forloop.last %}, {% endif %} + {% endfor %} +

+
+ {% endif %} + + + {% if results %} +
+ {% for result in results %} +
+
+ {% if result.is_exact_match %} + Correspondência exata + {% endif %} + {% if result.is_related %} + Termo relacionado + {% endif %} +
+
+ {{ result.highlighted_title|safe }} +
+
{{ result.highlighted_content|safe }}
+
+ {{ result.uploaded_at|date:"d/m/Y" }} +
+
+ {% endfor %} +
+ + + {% if total_pages > 1 %} + + {% endif %} + {% else %} + + {% endif %} + {% else %} + +
+

BuscaDocs

+
+
+
+ + +
+
+

Pesquise em nossa biblioteca de documentos digitalizados

+
+
Dicas de pesquisa:
+
    +
  • Use aspas duplas para buscar frases exatas: "documento oficial"
  • +
  • Tente usar sinônimos se não encontrar resultados
  • +
  • Seja específico para encontrar documentos relevantes
  • +
+
+
+
+ {% endif %} +
# Elasticsearch index queried by both views.
_INDEX_NAME = 'pdf_documents'
# Number of hits shown on one result page.
_PER_PAGE = 10
# Elasticsearch rejects ``from + size`` beyond ``index.max_result_window``
# (10,000 unless the index overrides it), so deeper pages are unreachable.
_MAX_RESULT_WINDOW = 10000
# Length of the plain-text excerpt used when a hit has no highlight.
_SNIPPET_LENGTH = 300
# Matches a double-quoted phrase and captures the text between the quotes.
_QUOTED_PHRASE_RE = re.compile(r'"([^"]*)"')


def _parse_page(raw_page, per_page=_PER_PAGE):
    """Return a safe 1-based page number parsed from a query-string value."""
    try:
        page = int(raw_page)
    except (TypeError, ValueError):
        return 1
    last_page = max(1, _MAX_RESULT_WINDOW // per_page)
    return min(max(page, 1), last_page)


def _split_query(query):
    """Split a raw query into ``(free_text, exact_phrases)``.

    Double-quoted segments are removed from the free text and returned as
    phrases; blank phrases (``""``) are dropped because an empty string is
    contained in every document and would flag every hit as an exact match.
    """
    phrases = [p for p in _QUOTED_PHRASE_RE.findall(query) if p.strip()]
    free_text = re.sub(r'\s+', ' ', _QUOTED_PHRASE_RE.sub('', query)).strip()
    return free_text, phrases


def _escaped_snippet(text, limit=None):
    """Return ``text`` HTML-escaped, cut to ``limit`` characters plus '...'."""
    # Local import: the standard-library escaper is only needed here.
    from html import escape

    text = text or ''
    if limit is not None and len(text) > limit:
        text = text[:limit] + '...'
    return escape(text)


def _apply_corrections(text, corrections):
    """Replace each ``(wrong, right)`` whole word in ``text``, ignoring case."""
    for wrong, right in corrections:
        pattern = r'\b' + re.escape(wrong) + r'\b'
        # A callable replacement keeps backslashes in ``right`` literal.
        text = re.sub(pattern, lambda _match, right=right: right, text,
                      flags=re.IGNORECASE)
    return text


def _build_search(cleaned_query, exact_phrases, page, per_page=_PER_PAGE):
    """Build the paginated, highlighted main search request."""
    clauses = []

    # Free text: fuzzy multi-field match, tolerant to typos.
    # NOTE(review): ``synonyms`` is not a mapped field of the document
    # class added in this change - confirm it is intended.
    if cleaned_query:
        clauses.append(
            Q('multi_match',
              query=cleaned_query,
              fields=['title^3', 'content^2', 'synonyms^1'],
              fuzziness='AUTO',
              boost=2)
        )
        clauses.append(
            Q('match', synonyms={'query': cleaned_query, 'boost': 0.5})
        )

    # Quoted phrases: exact word order (slop 0), title weighted above content.
    for phrase in exact_phrases:
        clauses.append(
            Q('match_phrase', title={'query': phrase, 'boost': 3, 'slop': 0})
        )
        clauses.append(
            Q('match_phrase', content={'query': phrase, 'boost': 2, 'slop': 0})
        )

    search = Search(index=_INDEX_NAME)
    if clauses:
        # OR semantics: any single clause is enough for a hit.
        search = search.query(
            Q('bool', should=clauses, minimum_should_match=1)
        )

    # The template renders highlights with ``|safe``; let Elasticsearch
    # HTML-escape the document text around the highlight tags.
    search = search.highlight_options(encoder='html')
    # NOTE(review): the tag literals are empty in the reviewed source (they
    # look stripped); restore the intended opening/closing tags.
    search = search.highlight('content', fragment_size=300,
                              number_of_fragments=2,
                              pre_tags=[''], post_tags=[''])
    search = search.highlight('title', fragment_size=300,
                              number_of_fragments=1,
                              pre_tags=[''], post_tags=[''])

    start = (page - 1) * per_page
    return search[start:start + per_page]


def _hit_to_result(hit, exact_phrases):
    """Convert one search hit into the dict consumed by the template."""
    title = getattr(hit, 'title', '') or ''
    content = getattr(hit, 'content', '') or ''
    highlight = getattr(hit.meta, 'highlight', None)
    content_fragments = getattr(highlight, 'content', None)
    title_fragments = getattr(highlight, 'title', None)

    if content_fragments:
        highlighted_content = ' ... '.join(content_fragments)
    else:
        # No highlight: fall back to the start of the document.
        highlighted_content = _escaped_snippet(content, _SNIPPET_LENGTH)

    if title_fragments:
        highlighted_title = title_fragments[0]
    else:
        highlighted_title = _escaped_snippet(title)

    lowered_content = content.lower()
    lowered_title = title.lower()
    is_exact_match = any(
        phrase.lower() in lowered_content or phrase.lower() in lowered_title
        for phrase in exact_phrases
    )

    return {
        'id': hit.meta.id,
        'title': title,
        'highlighted_title': highlighted_title,
        'highlighted_content': highlighted_content,
        'uploaded_at': getattr(hit, 'uploaded_at', None),
        'score': hit.meta.score,
        'is_exact_match': is_exact_match,
        'is_related': False,
    }


def _term_suggestions(cleaned_query):
    """Return ``(suggestions, corrections)`` from the term suggester.

    ``suggestions`` lists every proposed term once, in response order;
    ``corrections`` pairs each input word with its first proposal.
    """
    response = Search(index=_INDEX_NAME).suggest(
        'term_suggestion',
        cleaned_query,
        term={'field': 'content', 'suggest_mode': 'popular', 'size': 5},
    ).execute()

    suggest = getattr(response, 'suggest', None)
    entries = getattr(suggest, 'term_suggestion', None) or []

    suggestions = []
    corrections = []
    for entry in entries:
        options = [option.text for option in entry.options]
        suggestions.extend(options)
        if options:
            corrections.append((entry.text, options[0]))
    return list(dict.fromkeys(suggestions)), corrections


def _related_results(cleaned_query, seen_ids):
    """Return up to five "more like this" results not already listed."""
    search = Search(index=_INDEX_NAME).query(
        'more_like_this',
        fields=['content', 'title'],
        like=cleaned_query,
        min_term_freq=1,
        max_query_terms=10,
        min_doc_freq=1,
    )[:5]

    related = []
    for hit in search.execute():
        if hit.meta.id in seen_ids:
            continue
        title = getattr(hit, 'title', '') or ''
        content = getattr(hit, 'content', '') or ''
        related.append({
            'id': hit.meta.id,
            'title': title,
            'highlighted_title': _escaped_snippet(title),
            'highlighted_content': _escaped_snippet(content, _SNIPPET_LENGTH),
            'uploaded_at': getattr(hit, 'uploaded_at', None),
            'score': hit.meta.score,
            'is_exact_match': False,
            'is_related': True,
        })
    return related


def spellcheck_view(request):
    """Return phrase-suggester corrections for ``?q=`` as JSON.

    Responds with ``{"suggestions": [...]}``; the list is empty when no
    query is given or Elasticsearch proposes nothing.

    NOTE(review): the suggester targets a field named ``suggest``, which the
    document class added in this change does not map - confirm it exists.
    """
    query = request.GET.get('q', '').strip()
    suggestions = []

    if query:
        search = Search(index=_INDEX_NAME).suggest(
            'auto_correct',
            query,
            phrase={
                'field': 'suggest',
                'size': 3,
                'gram_size': 3,
                'confidence': 2.0,
                'direct_generator': [{
                    'field': 'suggest',
                    'suggest_mode': 'popular',
                }],
            },
        )
        response = search.execute()

        suggest = getattr(response, 'suggest', None)
        entries = getattr(suggest, 'auto_correct', None) or []
        # Guard against an empty entry list instead of indexing blindly.
        if entries:
            suggestions = [option.text for option in entries[0].options]

    return JsonResponse({'suggestions': suggestions})


def search_view(request):
    """Render the search page for ``?q=<query>&page=<n>``.

    Without a query the landing page is rendered.  Otherwise the query is
    split into free text (fuzzy match) and double-quoted phrases (exact
    match), the requested page of hits is fetched with highlights, and, when
    few hits come back, term suggestions, a "did you mean" correction and
    related documents are added.  A query with no searchable text (only
    whitespace or empty quotes) runs no search and shows no results.

    Invalid, missing or out-of-range ``page`` values fall back to the
    nearest valid page instead of raising.
    """
    query = request.GET.get('q', '').strip()
    page = _parse_page(request.GET.get('page'))
    per_page = _PER_PAGE

    # Bound before any branching so the render context below never reads an
    # undefined name when no query was submitted.
    cleaned_query, exact_phrases = _split_query(query)
    results = []
    suggestions = []
    spelling_correction = None
    total_hits = 0

    if cleaned_query or exact_phrases:
        response = _build_search(cleaned_query, exact_phrases, page,
                                 per_page).execute()
        total_hits = response.hits.total.value
        results = [_hit_to_result(hit, exact_phrases) for hit in response]

        # "Did you mean": only for text outside quotes, on sparse results.
        if total_hits < 5 and cleaned_query:
            suggestions, corrections = _term_suggestions(cleaned_query)

            if suggestions and total_hits == 0:
                corrected_query = _apply_corrections(cleaned_query,
                                                     corrections)
                if corrected_query != cleaned_query:
                    # Re-attach the quoted phrases untouched.
                    quoted = ''.join(f' "{phrase}"'
                                     for phrase in exact_phrases)
                    spelling_correction = (corrected_query + quoted).strip()

        # Related documents: only when almost nothing matched.
        if total_hits < 3 and cleaned_query:
            seen_ids = {result['id'] for result in results}
            results.extend(_related_results(cleaned_query, seen_ids))

    total_pages = (total_hits + per_page - 1) // per_page if total_hits > 0 else 0
    # Never link to pages Elasticsearch would refuse to serve.
    total_pages = min(total_pages, _MAX_RESULT_WINDOW // per_page)

    return render(request, 'diarios/search_results.html', {
        'query': query,
        'results': results,
        'suggestions': suggestions[:5],  # at most five suggestions
        'spelling_correction': spelling_correction,
        'total_hits': total_hits,
        'page': page,
        'total_pages': total_pages,
        'page_range': range(max(1, page-2), min(total_pages+1, page+3)),
        'has_exact_phrases': bool(exact_phrases)
    })
@@ -18,7 +27,7 @@ services: - ./.envs/.local/.django - ./.envs/.local/.postgres ports: - - '8000:8000' + - '8005:8005' command: /start postgres: @@ -32,3 +41,17 @@ services: - diarios_oficiais_alems_local_postgres_data_backups:/backups env_file: - ./.envs/.local/.postgres + + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:8.17.3 + environment: + - discovery.type=single-node + environment: + - discovery.type=single-node + - xpack.security.enabled=false + - "ES_JAVA_OPTS=-Xms512m -Xmx512m" + ports: + - "9200:9200" + volumes: + - esdata:/usr/share/elasticsearch/data + diff --git a/requirements/base.txt b/requirements/base.txt index 38cdb52..ce299c6 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -16,3 +16,7 @@ django-crispy-forms==2.3 # https://github.com/django-crispy-forms/django-crispy crispy-bootstrap5==2024.10 # https://github.com/django-crispy-forms/crispy-bootstrap5 django-compressor==4.5.1 # https://github.com/django-compressor/django-compressor django-redis==5.4.0 # https://github.com/jazzband/django-redis + +elasticsearch +django-elasticsearch-dsl +PyPDF2 diff --git a/synonyms.txt b/synonyms.txt new file mode 100644 index 0000000..e69de29