diff --git a/compose/local/django/start b/compose/local/django/start
index ba96db4..bc81e2a 100644
--- a/compose/local/django/start
+++ b/compose/local/django/start
@@ -6,4 +6,4 @@ set -o nounset
python manage.py migrate
-exec python manage.py runserver_plus 0.0.0.0:8000
+exec python manage.py runserver_plus 0.0.0.0:8005
diff --git a/config/settings/base.py b/config/settings/base.py
index 8c283c6..68af005 100644
--- a/config/settings/base.py
+++ b/config/settings/base.py
@@ -82,7 +82,8 @@ THIRD_PARTY_APPS = [
LOCAL_APPS = [
"diários_oficiais_alems.users",
- # Your stuff: custom apps go here
+ "diarios",
+ 'django_elasticsearch_dsl',
]
# https://docs.djangoproject.com/en/dev/ref/settings/#installed-apps
INSTALLED_APPS = DJANGO_APPS + THIRD_PARTY_APPS + LOCAL_APPS
@@ -287,7 +288,8 @@ STATICFILES_FINDERS += ["compressor.finders.CompressorFinder"]
# Elastic Search
# ------------------------------------------------------------------------------
ELASTICSEARCH_DSL = {
- 'default': {
- 'hosts': env('ELASTICSEARCH_HOSTS', default='localhost:9200')
- },
-}
\ No newline at end of file
+    'default': {
+        'hosts': env('ELASTICSEARCH_HOSTS', default='http://elasticsearch:9200'),  # overridable per environment
+    },
+}
+ELASTICSEARCH_HOSTS = ELASTICSEARCH_DSL['default']['hosts']  # single source of truth; read by diarios/views.py
diff --git a/config/settings/local.py b/config/settings/local.py
index 28700a8..1787e95 100644
--- a/config/settings/local.py
+++ b/config/settings/local.py
@@ -14,7 +14,7 @@ SECRET_KEY = env(
default="tYdYl0MP5zgpMlMmjBuYHvH4Dp3JDN5q3sxWBdFejemZSr0qpI9IrvrvTm17F0aW",
)
# https://docs.djangoproject.com/en/dev/ref/settings/#allowed-hosts
-ALLOWED_HOSTS = ["localhost", "0.0.0.0", "127.0.0.1"] # noqa: S104
+ALLOWED_HOSTS = ["localhost", "0.0.0.0", "127.0.0.1", "109.199.98.226"] # noqa: S104
# CACHES
# ------------------------------------------------------------------------------
diff --git a/config/urls.py b/config/urls.py
index b7adf62..c0786ff 100644
--- a/config/urls.py
+++ b/config/urls.py
@@ -20,7 +20,7 @@ urlpatterns = [
path("users/", include("diários_oficiais_alems.users.urls", namespace="users")),
path("accounts/", include("allauth.urls")),
# Your stuff: custom urls includes go here
- # ...
+ path("diarios/", include("diarios.urls")),
# Media files
*static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT),
]
diff --git a/diarios/__init__.py b/diarios/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/diarios/admin.py b/diarios/admin.py
new file mode 100644
index 0000000..4641b0b
--- /dev/null
+++ b/diarios/admin.py
@@ -0,0 +1,8 @@
+from django.contrib import admin
+from .models import PDFDocument
+from django.db import models
+
+
+@admin.register(PDFDocument)  # registers PDFDocument with the Django admin site
+class PDFDocumentAdmin(admin.ModelAdmin):  # no customisation: stock ModelAdmin behaviour
+ pass
diff --git a/diarios/apps.py b/diarios/apps.py
new file mode 100644
index 0000000..55164d7
--- /dev/null
+++ b/diarios/apps.py
@@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class DiariosConfig(AppConfig):  # Django app configuration for the "diarios" app
+    default_auto_field = 'django.db.models.BigAutoField'  # primary-key type for models without an explicit one
+    name = 'diarios'
diff --git a/diarios/documents.py b/diarios/documents.py
new file mode 100644
index 0000000..35bff2d
--- /dev/null
+++ b/diarios/documents.py
@@ -0,0 +1,68 @@
+from django_elasticsearch_dsl import Document, fields
+from django_elasticsearch_dsl.registries import registry
+from .models import PDFDocument
+
+@registry.register_document
+class PDFDocumentDocument(Document):  # Elasticsearch document for PDFDocument, stored in the 'pdf_documents' index
+    title = fields.TextField()
+    content = fields.TextField(analyzer='portuguese')  # same name as the analyzer defined in Index.settings
+
+    class Index:
+        name = 'pdf_documents'
+        settings = {
+            'number_of_shards': 1,
+            'number_of_replicas': 0,
+            'analysis': {
+                'analyzer': {
+                    'portuguese': {
+                        'type': 'custom',
+                        'tokenizer': 'standard',
+                        'filter': [  # applied in order: each filter sees the previous one's output
+                            'lowercase',
+                            'stop',  # before folding/stemming, while tokens still match the stopword list
+                            'ascii_folding',
+                            'portuguese_stemmer',
+                            'portuguese_synonyms',
+                        ]
+                    },
+                    'portuguese_search': {  # NOTE(review): no field in this file references this analyzer - confirm it is needed
+                        'type': 'custom',
+                        'tokenizer': 'standard',
+                        'filter': [  # same ordering rule as the 'portuguese' analyzer
+                            'lowercase',
+                            'stop',
+                            'ascii_folding',
+                            'portuguese_stemmer',
+                            'suggest_shingle',
+                        ]
+                    }
+                },
+                'filter': {
+                    'suggest_shingle': {
+                        'type': 'shingle',
+                        'min_shingle_size': 2,
+                        'max_shingle_size': 3
+                    },
+                    'stop': {
+                        'type': 'stop',
+                        'stopwords': '_portuguese_'
+                    },
+                    'ascii_folding': {
+                        'type': 'asciifolding'
+                    },
+                    'portuguese_stemmer': {
+                        'type': 'stemmer',
+                        'language': 'portuguese'
+                    },
+                    'portuguese_synonyms': {
+                        'type': 'synonym',
+                        'synonyms_path': 'synonyms.txt',  # NOTE(review): presumably resolved inside the Elasticsearch container; docker-compose.local.yml mounts only esdata - confirm the file is reachable
+                        'expand': True
+                    }
+                }
+            }
+        }
+
+    class Django:
+        model = PDFDocument
+        fields = ['uploaded_at']  # title and content are declared explicitly above
diff --git a/diarios/migrations/0001_initial.py b/diarios/migrations/0001_initial.py
new file mode 100644
index 0000000..ec3516f
--- /dev/null
+++ b/diarios/migrations/0001_initial.py
@@ -0,0 +1,24 @@
+# Generated by Django 5.0.12 on 2025-03-06 16:00
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):  # first migration of the diarios app
+
+    initial = True
+
+    dependencies = [
+    ]
+
+    operations = [
+        migrations.CreateModel(  # creates the PDFDocument model
+            name='PDFDocument',
+            fields=[
+                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('title', models.CharField(max_length=255)),
+                ('file', models.FileField(upload_to='pdfs/')),
+                ('content', models.TextField(blank=True)),
+                ('uploaded_at', models.DateTimeField(auto_now_add=True)),
+            ],
+        ),
+    ]
diff --git a/diarios/migrations/__init__.py b/diarios/migrations/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/diarios/models.py b/diarios/models.py
new file mode 100644
index 0000000..d5391b9
--- /dev/null
+++ b/diarios/models.py
@@ -0,0 +1,29 @@
+from django.db import models
+import PyPDF2
+
+
+class PDFDocument(models.Model):
+    """Uploaded PDF whose extracted text is stored in ``content``."""
+
+    title = models.CharField(max_length=255)
+    file = models.FileField(upload_to='pdfs/')
+    content = models.TextField(blank=True)  # filled from ``file`` by save()
+    uploaded_at = models.DateTimeField(auto_now_add=True)
+
+    def __str__(self):
+        return self.title
+
+    def save(self, *args, **kwargs):
+        """Refresh ``content`` from the PDF, then save.
+
+        The text of every page is joined with newlines. Extraction is
+        skipped when ``update_fields`` is passed without ``'content'``,
+        because the extracted text would not be written in that case.
+        """
+        update_fields = kwargs.get('update_fields')
+        if self.file and (update_fields is None or 'content' in update_fields):
+            pdf = PyPDF2.PdfReader(self.file)
+            # ``or ''`` keeps join() from raising if a page yields None
+            self.content = '\n'.join(page.extract_text() or '' for page in pdf.pages)
+
+        super().save(*args, **kwargs)
diff --git a/diarios/signals.py b/diarios/signals.py
new file mode 100644
index 0000000..bceff3c
--- /dev/null
+++ b/diarios/signals.py
@@ -0,0 +1,28 @@
+import logging
+
+from django.db.models.signals import post_save
+from django.dispatch import receiver
+import PyPDF2
+from io import BytesIO
+from .models import PDFDocument
+
+logger = logging.getLogger(__name__)
+
+
+@receiver(post_save, sender=PDFDocument)
+def extract_text(sender, instance, created, **kwargs):
+    """Fill ``content`` for a newly created PDFDocument that still has none.
+
+    NOTE(review): nothing in this change imports this module (DiariosConfig
+    has no ``ready()``), so the receiver may never be connected - confirm.
+    """
+    logger.debug("post_save received for PDFDocument pk=%s", instance.pk)
+    # PDFDocument.save() already extracts the text; only act when it left
+    # ``content`` empty, so the PDF is not parsed and saved a second time.
+    if not created or not instance.file or instance.content:
+        return
+    pdf = PyPDF2.PdfReader(instance.file)
+    text = '\n'.join(page.extract_text() or '' for page in pdf.pages)
+    if text:
+        instance.content = text
+        instance.save(update_fields=['content'])
diff --git a/diarios/templates/diarios/search_results.html b/diarios/templates/diarios/search_results.html
new file mode 100644
index 0000000..86819f0
--- /dev/null
+++ b/diarios/templates/diarios/search_results.html
@@ -0,0 +1,310 @@
+
+
+
+
+
+ {% if query %}{{ query }} - {% endif %}Pesquisa de Documentos
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% if query %}
+
+
+ {% if total_hits > 0 %}
+
Cerca de {{ total_hits }} resultados encontrados para "{{ query }}"
+ {% else %}
+
Nenhum resultado encontrado para "{{ query }}"
+ {% endif %}
+
+
+
+ {% if spelling_correction %}
+
+ {% endif %}
+
+
+ {% if suggestions %}
+
+
Talvez você esteja procurando por:
+ {% for suggestion in suggestions %}
+ {{ suggestion }}{% if not forloop.last %}, {% endif %}
+ {% endfor %}
+
+
+ {% endif %}
+
+
+ {% if results %}
+
+ {% for result in results %}
+
+
+ {% if result.is_exact_match %}
+ Correspondência exata
+ {% endif %}
+ {% if result.is_related %}
+ Termo relacionado
+ {% endif %}
+
+
+
{{ result.highlighted_content|safe }}
+
+ {{ result.uploaded_at|date:"d/m/Y" }}
+
+
+ {% endfor %}
+
+
+
+ {% if total_pages > 1 %}
+
+ {% endif %}
+ {% else %}
+
+
+ Nenhum documento corresponde aos termos de pesquisa. Tente usar palavras-chave diferentes ou mais gerais.
+ {% if has_exact_phrases %}
+
Você pesquisou por frases exatas. Tente remover as aspas para uma busca mais ampla.
+ {% endif %}
+
+ {% endif %}
+ {% else %}
+
+
+
BuscaDocs
+
+
+
Pesquise em nossa biblioteca de documentos digitalizados
+
+
Dicas de pesquisa:
+
+ - Use aspas duplas para buscar frases exatas:
"documento oficial"
+ - Tente usar sinônimos se não encontrar resultados
+ - Seja específico para encontrar documentos relevantes
+
+
+
+
+ {% endif %}
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/diarios/tests.py b/diarios/tests.py
new file mode 100644
index 0000000..7ce503c
--- /dev/null
+++ b/diarios/tests.py
@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.
diff --git a/diarios/urls.py b/diarios/urls.py
new file mode 100644
index 0000000..0f0075f
--- /dev/null
+++ b/diarios/urls.py
@@ -0,0 +1,7 @@
+from django.urls import path
+from .views import search_view, spellcheck_view
+
+urlpatterns = [  # included under "diarios/" by config/urls.py
+    path('pesquisa/', search_view, name='search_view'),  # HTML search page
+    path('spellcheck/', spellcheck_view, name='spellcheck_view'),  # JSON spelling suggestions
+]
diff --git a/diarios/views.py b/diarios/views.py
new file mode 100644
index 0000000..49ef3e3
--- /dev/null
+++ b/diarios/views.py
@@ -0,0 +1,301 @@
+from django.shortcuts import render
+from elasticsearch_dsl import Search, Q
+from elasticsearch_dsl.connections import connections
+from django.conf import settings
+from django.utils.dateparse import parse_datetime
+import re
+
+from django.http import JsonResponse
+
+
+# Elasticsearch connection setup
+connections.create_connection(hosts=[settings.ELASTICSEARCH_HOSTS])
+
+INDEX_NAME = 'pdf_documents'  # same name as PDFDocumentDocument.Index.name
+PER_PAGE = 10
+SNIPPET_CHARS = 300
+# Elasticsearch rejects from + size beyond index.max_result_window (10,000 by default)
+MAX_PAGE = 10000 // PER_PAGE
+
+
+def spellcheck_view(request):
+    """Return spelling suggestions for ``?q=`` as JSON.
+
+    The response body is ``{"suggestions": [...]}``. The list stays empty
+    when ``q`` is missing or Elasticsearch proposes no correction.
+    """
+    query = request.GET.get('q', '')
+    suggestions = []
+
+    if query:
+        s = Search(index=INDEX_NAME)
+        # The phrase suggester needs a mapped field. The index mapping in
+        # diarios/documents.py declares no 'suggest' field, so 'content' is
+        # used, matching the term suggester in search_view.
+        s = s.suggest('auto_correct', query,
+                      phrase={
+                          'field': 'content',
+                          'size': 3,
+                          'gram_size': 3,
+                          'confidence': 2.0,
+                          'direct_generator': [{
+                              'field': 'content',
+                              'suggest_mode': 'popular'
+                          }]
+                      })
+        response = s.execute()
+
+        if hasattr(response, 'suggest') and hasattr(response.suggest, 'auto_correct'):
+            # Iterating (rather than indexing [0]) also copes with an empty entry list
+            for entry in response.suggest.auto_correct:
+                suggestions.extend(option.text for option in entry.options)
+
+    return JsonResponse({'suggestions': suggestions})
+
+
+def _parse_page(raw_page):
+    """Return *raw_page* as a page number clamped to ``1..MAX_PAGE`` (1 if not an integer)."""
+    try:
+        page = int(raw_page)
+    except (TypeError, ValueError):
+        return 1
+    return min(max(page, 1), MAX_PAGE)
+
+
+def _split_query(query):
+    """Split *query* into ``(free_text, exact_phrases)``.
+
+    Double-quoted segments become exact phrases (blank ones are dropped);
+    the remainder, with runs of whitespace collapsed, is the free text.
+    """
+    quoted = re.findall(r'"([^"]*)"', query)
+    free_text = query
+    for phrase in quoted:
+        free_text = free_text.replace(f'"{phrase}"', '')
+    free_text = re.sub(r'\s+', ' ', free_text).strip()
+    return free_text, [phrase for phrase in quoted if phrase.strip()]
+
+
+def _snippet(text):
+    """Return *text* cut to SNIPPET_CHARS characters, with '...' appended when truncated."""
+    if len(text) > SNIPPET_CHARS:
+        return text[:SNIPPET_CHARS] + '...'
+    return text
+
+
+def _as_datetime(value):
+    """Parse an ISO-8601 string into a datetime; return anything else unchanged."""
+    if not isinstance(value, str):
+        return value
+    try:
+        return parse_datetime(value) or value
+    except ValueError:
+        return value
+
+
+def search_view(request):
+    """Render the document search page.
+
+    Query parameters:
+        q: search text; double-quoted segments are matched as exact phrases.
+        page: 1-based result page; invalid values fall back to 1.
+
+    Besides the hits, the context may carry term suggestions (fewer than 5
+    hits), a corrected query (no hits) and "more like this" documents
+    (fewer than 3 hits).
+    """
+    query = request.GET.get('q', '')  # search text from the URL
+    page = _parse_page(request.GET.get('page', 1))
+
+    results = []
+    suggestions = []
+    spelling_correction = None
+    total_hits = 0
+    per_page = PER_PAGE
+    # Bound up front: the template context reads it even when no search runs
+    exact_phrases = []
+
+    if query:
+        # Quoted phrases are searched exactly; the rest is searched fuzzily
+        cleaned_query, exact_phrases = _split_query(query)
+
+        search = Search(index=INDEX_NAME)
+
+        # Every clause below is OR-ed together
+        queries = []
+
+        # General terms, with fuzziness to tolerate typos
+        if cleaned_query:
+            # NOTE(review): 'synonyms' is not declared in the mapping shown in
+            # diarios/documents.py, so these clauses may never match - confirm.
+            queries.append(
+                Q('multi_match',
+                  query=cleaned_query,
+                  fields=['title^3', 'content^2', 'synonyms^1'],
+                  fuzziness='AUTO',
+                  boost=2,)
+            )
+            queries.append(
+                Q('match',
+                  synonyms={
+                      'query': cleaned_query,
+                      'boost': 0.5
+                  })
+            )
+
+        # Exact (non-fuzzy) clauses for each quoted phrase
+        for phrase in exact_phrases:
+            # Title match, weighted highest
+            queries.append(
+                Q('match_phrase',
+                  title={
+                      'query': phrase,
+                      'boost': 3,
+                      'slop': 0  # no flexibility in word order
+                  })
+            )
+
+            # Content match, weighted medium
+            queries.append(
+                Q('match_phrase',
+                  content={
+                      'query': phrase,
+                      'boost': 2,
+                      'slop': 0  # no flexibility in word order
+                  })
+            )
+
+        if queries:
+            search = search.query(
+                Q('bool', should=queries, minimum_should_match=1)
+            )
+
+        # Highlighting, sized to show some context around each match
+        search = search.highlight('content', fragment_size=300, number_of_fragments=2, pre_tags=[''], post_tags=[''])
+        search = search.highlight('title', fragment_size=300, number_of_fragments=1, pre_tags=[''], post_tags=[''])
+
+        # Pagination
+        search = search[(page-1)*per_page:page*per_page]
+
+        response = search.execute()
+        total_hits = response.hits.total.value
+
+        lowered_phrases = [phrase.lower() for phrase in exact_phrases]
+        for hit in response:
+            title = getattr(hit, 'title', None) or ''
+            content = getattr(hit, 'content', None) or ''
+            highlight = getattr(hit.meta, 'highlight', None)
+
+            # NOTE(review): search_results.html renders highlighted_content with
+            # |safe, and nothing here HTML-escapes the document text.
+            if highlight is not None and hasattr(highlight, 'content'):
+                highlighted_content = ' ... '.join(highlight.content)
+            else:
+                highlighted_content = _snippet(content)
+
+            if highlight is not None and hasattr(highlight, 'title'):
+                highlighted_title = highlight.title[0]
+            else:
+                highlighted_title = title
+
+            content_lower = content.lower()
+            title_lower = title.lower()
+            is_exact_match = any(phrase in content_lower or phrase in title_lower
+                                 for phrase in lowered_phrases)
+
+            results.append({
+                'id': hit.meta.id,
+                'title': title,
+                'highlighted_title': highlighted_title,
+                'highlighted_content': highlighted_content,
+                # The template formats this with |date, which needs a datetime
+                'uploaded_at': _as_datetime(getattr(hit, 'uploaded_at', None)),
+                'score': hit.meta.score,
+                'is_exact_match': is_exact_match
+            })
+
+        # "Did you mean" suggestions (only for the terms outside quotes)
+        if total_hits < 5 and cleaned_query:
+            suggestion_search = Search(index=INDEX_NAME)
+            suggestion_search = suggestion_search.suggest(
+                'term_suggestion',
+                cleaned_query,
+                term={
+                    'field': 'content',
+                    'suggest_mode': 'popular',
+                    'size': 5
+                }
+            )
+            suggestion_response = suggestion_search.execute()
+
+            term_entries = []
+            if hasattr(suggestion_response, 'suggest') and hasattr(suggestion_response.suggest, 'term_suggestion'):
+                term_entries = list(suggestion_response.suggest.term_suggestion)
+            for entry in term_entries:
+                suggestions.extend(option.text for option in entry.options)
+
+            # Build a corrected query when the original one found nothing
+            if suggestions and total_hits == 0:
+                corrected_query = cleaned_query
+                for entry in term_entries:
+                    if entry.options:
+                        best = entry.options[0].text
+                        # A callable replacement is inserted verbatim, so a
+                        # backslash in the suggestion cannot be read as an escape
+                        corrected_query = re.sub(r'\b' + re.escape(entry.text) + r'\b',
+                                                 lambda _match, best=best: best,
+                                                 corrected_query,
+                                                 flags=re.IGNORECASE)
+
+                # Re-attach the quoted phrases to the corrected free text
+                if corrected_query != cleaned_query:
+                    spelling_correction = corrected_query
+                    for phrase in exact_phrases:
+                        spelling_correction += f' "{phrase}"'
+                    spelling_correction = spelling_correction.strip()
+
+        # Related documents (only when there are very few results)
+        if total_hits < 3 and cleaned_query:
+            related_terms = Search(index=INDEX_NAME)
+            related_terms = related_terms.query(
+                'more_like_this',
+                fields=['content', 'title'],
+                like=cleaned_query,
+                min_term_freq=1,
+                max_query_terms=10,
+                min_doc_freq=1
+            )
+            related_terms = related_terms[:5]
+            related_response = related_terms.execute()
+
+            seen_ids = {result['id'] for result in results}
+            for hit in related_response:
+                # Skip documents already listed
+                if hit.meta.id in seen_ids:
+                    continue
+                seen_ids.add(hit.meta.id)
+                title = getattr(hit, 'title', None) or ''
+                results.append({
+                    'id': hit.meta.id,
+                    'title': title,
+                    'highlighted_title': title,
+                    'highlighted_content': _snippet(getattr(hit, 'content', None) or ''),
+                    'uploaded_at': _as_datetime(getattr(hit, 'uploaded_at', None)),
+                    'score': hit.meta.score,
+                    'is_related': True
+                })
+
+    # Page count, rounded up
+    total_pages = (total_hits + per_page - 1) // per_page if total_hits > 0 else 0
+
+    return render(request, 'diarios/search_results.html', {
+        'query': query,
+        'results': results,
+        'suggestions': suggestions[:5],  # at most 5 suggestions
+        'spelling_correction': spelling_correction,
+        'total_hits': total_hits,
+        'page': page,
+        'total_pages': total_pages,
+        'page_range': range(max(1, page-2), min(total_pages+1, page+3)),
+        'has_exact_phrases': bool(exact_phrases)
+    })
diff --git a/docker-compose.local.yml b/docker-compose.local.yml
index 1af9f40..81f388b 100644
--- a/docker-compose.local.yml
+++ b/docker-compose.local.yml
@@ -1,9 +1,9 @@
volumes:
diarios_oficiais_alems_local_postgres_data: {}
diarios_oficiais_alems_local_postgres_data_backups: {}
-
+ esdata:
services:
django:
build:
context: .
@@ -18,7 +18,7 @@ services:
- ./.envs/.local/.django
- ./.envs/.local/.postgres
ports:
- - '8000:8000'
+ - '8005:8005'
command: /start
postgres:
@@ -32,3 +32,16 @@ services:
- diarios_oficiais_alems_local_postgres_data_backups:/backups
env_file:
- ./.envs/.local/.postgres
+
+ # Single-node Elasticsearch for local development: security disabled, 512 MB heap, data kept in the esdata volume
+ elasticsearch:
+ image: docker.elastic.co/elasticsearch/elasticsearch:8.17.3
+ environment:
+ - discovery.type=single-node
+ - xpack.security.enabled=false
+ - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
+ ports:
+ - "9200:9200"
+ volumes:
+ - esdata:/usr/share/elasticsearch/data
+
diff --git a/requirements/base.txt b/requirements/base.txt
index 38cdb52..ce299c6 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -16,3 +16,7 @@ django-crispy-forms==2.3 # https://github.com/django-crispy-forms/django-crispy
crispy-bootstrap5==2024.10 # https://github.com/django-crispy-forms/crispy-bootstrap5
django-compressor==4.5.1 # https://github.com/django-compressor/django-compressor
django-redis==5.4.0 # https://github.com/jazzband/django-redis
+
+elasticsearch>=8,<9  # https://github.com/elastic/elasticsearch-py (matches the elasticsearch:8.17.3 image in docker-compose.local.yml)
+django-elasticsearch-dsl>=8,<9  # https://github.com/django-es/django-elasticsearch-dsl
+PyPDF2>=3,<4  # https://pypi.org/project/PyPDF2/ (PdfReader API used by diarios)
diff --git a/synonyms.txt b/synonyms.txt
new file mode 100644
index 0000000..e69de29