arruma o processo de busca textual nos diarios

This commit is contained in:
root
2025-03-14 17:36:14 +01:00
parent 8d1f6feeaf
commit f2e5cd73b7
15 changed files with 650 additions and 645 deletions

View File

@ -84,6 +84,7 @@ LOCAL_APPS = [
"diários_oficiais_alems.users", "diários_oficiais_alems.users",
"diarios", "diarios",
"django_elasticsearch_dsl", "django_elasticsearch_dsl",
"rest_framework",
] ]
# https://docs.djangoproject.com/en/dev/ref/settings/#installed-apps # https://docs.djangoproject.com/en/dev/ref/settings/#installed-apps
INSTALLED_APPS = DJANGO_APPS + THIRD_PARTY_APPS + LOCAL_APPS INSTALLED_APPS = DJANGO_APPS + THIRD_PARTY_APPS + LOCAL_APPS
@ -293,3 +294,36 @@ ELASTICSEARCH_DSL = {
"default": {"hosts": "http://elasticsearch:9200"}, # same as above "default": {"hosts": "http://elasticsearch:9200"}, # same as above
} }
ELASTICSEARCH_HOSTS = "http://elasticsearch:9200" ELASTICSEARCH_HOSTS = "http://elasticsearch:9200"
# Index-level Elasticsearch settings: a single shard with no replicas, plus
# the Portuguese text-analysis chain exposed under the name "pt_analyzer".
ELASTICSEARCH_INDEX_SETTINGS = {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "analysis": {
        # Token filters referenced by name from the analyzer below.
        "filter": {
            "portuguese_stop": {"type": "stop", "stopwords": "_portuguese_"},
            "portuguese_stemmer": {"type": "stemmer", "language": "portuguese"},
            "synonym_filter": {
                "type": "synonym",
                "synonyms_path": "analysis/sinonimos.txt",
            },
        },
        # Standard tokenizer followed by the filters, applied in list order.
        "analyzer": {
            "pt_analyzer": {
                "tokenizer": "standard",
                "filter": [
                    "lowercase",
                    "portuguese_stop",
                    "portuguese_stemmer",
                    "synonym_filter",
                ],
            },
        },
    },
}

View File

@ -1,8 +1,18 @@
from django.contrib import admin from django.contrib import admin
from .models import PDFDocument from .models import PDFDocument, DiarioOficial, TipoDiarioOficial
from django.db import models from django.db import models
@admin.register(PDFDocument) @admin.register(PDFDocument)
class PDFDocumentAdmin(admin.ModelAdmin): class PDFDocumentAdmin(admin.ModelAdmin):
pass pass
@admin.register(DiarioOficial)
class DiarioOficialAdmin(admin.ModelAdmin):
pass
@admin.register(TipoDiarioOficial)
class TipoDiarioOficialAdmin(admin.ModelAdmin):
pass

View File

@ -0,0 +1,8 @@
from django import template
register = template.Library()
@register.filter
def get_range(value):
    """Return ``range(value)`` so a template can loop a given number of times.

    Template variables frequently arrive as strings (or are missing
    altogether), and ``range`` only accepts integers. The value is therefore
    coerced with ``int`` first; anything that cannot be coerced yields an
    empty range instead of raising while the page is being rendered.

    Args:
        value: The number of iterations wanted (an int or an int-like value).

    Returns:
        ``range(int(value))``, or ``range(0)`` when ``value`` is not int-like.
    """
    try:
        count = int(value)
    except (TypeError, ValueError):
        return range(0)
    return range(count)

View File

@ -1,157 +1,91 @@
from django_elasticsearch_dsl import Document, fields
from django_elasticsearch_dsl.registries import registry
from .models import PDFDocument
@registry.register_document
class PDFDocumentDocument(Document):
title = fields.TextField()
content = fields.TextField(analyzer="portuguese")
pages = fields.NestedField(
properties={
"number": fields.IntegerField(),
"content": fields.TextField(analyzer="portuguese"),
}
)
class Index:
name = "pdf_documents"
settings = {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"analyzer": {
"portuguese": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"ascii_folding",
"portuguese_stemmer",
"stop",
"portuguese_synonyms",
],
},
"portuguese_search": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"ascii_folding",
"portuguese_stemmer",
"stop",
"suggest_shingle",
],
},
},
"filter": {
"suggest_shingle": {
"type": "shingle",
"min_shingle_size": 2,
"max_shingle_size": 3,
},
"stop": {"type": "stop", "stopwords": "_portuguese_"},
"ascii_folding": {"type": "asciifolding"},
"portuguese_stemmer": {"type": "stemmer", "language": "portuguese"},
"portuguese_synonyms": {
"type": "synonym",
"synonyms_path": "synonyms.txt",
"expand": True,
},
},
},
}
class Django:
model = PDFDocument
fields = ["uploaded_at", "file"]
from django_elasticsearch_dsl import Document, fields from django_elasticsearch_dsl import Document, fields
from django_elasticsearch_dsl.registries import registry from django_elasticsearch_dsl.registries import registry
from .models import DiarioOficial from .models import DiarioOficial
@registry.register_document @registry.register_document
class DiarioOficialDocument(Document): class DiarioOficialDocument(Document):
# Campos principais tipo = fields.ObjectField(properties={
title = fields.TextField() 'nome': fields.TextField()
tipo = fields.KeywordField() })
# Campo para arquivo PDF (se aplicável) numero = fields.TextField()
arquivo = fields.TextField(attr="arquivo.url") data = fields.DateField()
link = fields.TextField()
# Nested field para páginas (usando o page_content)
pages = fields.NestedField( # Campo para armazenar todas as páginas para busca
properties={ content = fields.TextField(
"number": fields.IntegerField(), analyzer='pt_analyzer',
"content": fields.TextField(analyzer="portuguese")
}
) )
# Campo para armazenar páginas individualmente
pages = fields.NestedField(properties={
'number': fields.IntegerField(),
'content': fields.TextField(
analyzer='pt_analyzer',
)
})
class Index: class Index:
name = "diarios_oficiais" name = 'diarios_oficiais'
settings = { settings = {
"number_of_shards": 1, 'number_of_shards': 1,
"number_of_replicas": 0, 'number_of_replicas': 0,
"analysis": { 'analysis': {
"analyzer": { 'filter': {
"portuguese": { 'portuguese_stop': {
"type": "custom", 'type': 'stop',
"tokenizer": "standard", 'stopwords': '_portuguese_'
"filter": [
"lowercase",
"ascii_folding",
"portuguese_stemmer",
"stop",
"portuguese_synonyms",
]
}, },
"portuguese_search": { 'portuguese_stemmer': {
"type": "custom", 'type': 'stemmer',
"tokenizer": "standard", 'language': 'portuguese'
"filter": [ },
"lowercase", 'synonym_filter': {
"ascii_folding", 'type': 'synonym',
"portuguese_stemmer", 'synonyms': [
"stop", 'lei, legislação, norma',
"suggest_shingle", 'processo, procedimento, autos',
'contrato, acordo, convênio',
# Adicione mais sinônimos relevantes para o contexto legal
] ]
} }
}, },
"filter": { 'analyzer': {
"suggest_shingle": { 'pt_analyzer': {
"type": "shingle", 'tokenizer': 'standard',
"min_shingle_size": 2, 'filter': [
"max_shingle_size": 3 'lowercase',
}, 'portuguese_stop',
"stop": {"type": "stop", "stopwords": "_portuguese_"}, 'portuguese_stemmer',
"ascii_folding": {"type": "asciifolding"}, 'synonym_filter'
"portuguese_stemmer": {"type": "stemmer", "language": "portuguese"}, ]
"portuguese_synonyms": {
"type": "synonym",
"synonyms_path": "synonyms.txt",
"expand": True
} }
} }
} }
} }
class Django: class Django:
model = DiarioOficial model = DiarioOficial
fields = [ fields = [
"data", 'id'
"numero",
"link",
] ]
def prepare_tipo(self, instance): def prepare_tipo(self, instance):
return instance.tipo.nome if instance.tipo else None if instance.tipo:
return {
def prepare_title(self, instance): 'nome': instance.tipo.nome
return f"{instance.tipo.nome if instance.tipo else 'Diário'} {instance.numero}" }
return {}
def prepare_pages(self, instance):
# Prepara o campo pages usando o page_content def prepare_content(self, instance):
"""Concatena todo o conteúdo de todas as páginas em um único campo para busca"""
if instance.page_content: if instance.page_content:
return instance.page_content # page_content já é uma lista de dicionários return " ".join([page.get('content', '') for page in instance.page_content])
return ""
def prepare_pages(self, instance):
"""Prepara o campo de páginas individuais para exibição e destaque"""
if instance.page_content:
return instance.page_content
return [] return []

View File

@ -0,0 +1,24 @@
from django.core.management.base import BaseCommand
from django_elasticsearch_dsl.registries import registry
class Command(BaseCommand):
    """Management command that rebuilds the registered Elasticsearch indices."""

    help = 'Reindexar todos os Diários Oficiais no Elasticsearch'

    def handle(self, *args, **options):
        """Drop, recreate and repopulate every index known to the registry.

        The registry only exposes ``get_indices()`` / ``get_documents()``, so
        the indices are deleted and created through the ``Index`` objects
        themselves. Each document class is then fed the rows of its model,
        because ``Document.update`` needs the objects to index.
        """
        self.stdout.write('Iniciando reindexação...')

        indices = list(registry.get_indices())

        # Recreate the indices. ``exists()`` is checked first so a missing
        # index (first run) does not abort the command.
        for index in indices:
            if index.exists():
                index.delete()
            index.create()

        # Re-index the documents, grouped by the index they belong to.
        for index in indices:
            self.stdout.write(f'Reindexando {index._name}...')
            for doc in registry.get_documents():
                # Compare names: ``index`` is an Index object while
                # ``doc._index._name`` is a plain string.
                if doc._index._name != index._name:
                    continue
                self.stdout.write(f' + {doc.__name__}')
                document = doc()
                # ``iterator()`` streams rows instead of caching the whole
                # table, which matters when rows carry full page text.
                document.update(document.get_queryset().iterator())

        self.stdout.write(self.style.SUCCESS('Reindexação concluída!'))

View File

@ -1,7 +1,12 @@
import requests
import os
from urllib.parse import urlparse
from django.core.files.base import ContentFile
from django.db import models from django.db import models
import PyPDF2 import PyPDF2
import json import json
from django.core.serializers.json import DjangoJSONEncoder from django.core.serializers.json import DjangoJSONEncoder
from babel.dates import format_date
class PDFDocument(models.Model): class PDFDocument(models.Model):
@ -34,6 +39,7 @@ class PDFDocument(models.Model):
super().save(*args, **kwargs) super().save(*args, **kwargs)
class TipoDiarioOficial(models.Model): class TipoDiarioOficial(models.Model):
nome = models.CharField(max_length=100, unique=True) nome = models.CharField(max_length=100, unique=True)
@ -57,24 +63,51 @@ class DiarioOficial(models.Model):
numero = models.CharField(max_length=20, unique=True) numero = models.CharField(max_length=20, unique=True)
link = models.URLField(blank=True, null=True, unique=True) link = models.URLField(blank=True, null=True, unique=True)
page_content = models.JSONField(encoder=DjangoJSONEncoder, blank=True, null=True) page_content = models.JSONField(encoder=DjangoJSONEncoder, blank=True, null=True)
def save(self, *args, **kwargs): def save(self, *args, **kwargs):
if self.file: # Se houver um link, baixa o PDF e extrai o conteúdo
pdf = PyPDF2.PdfReader(self.file) if self.link and not self.arquivo:
pages_data = [] try:
# Faz o download do PDF
response = requests.get(self.link)
response.raise_for_status() # Verifica se o download foi bem-sucedido
for i, pagina in enumerate(pdf.pages): # Define o nome do arquivo a partir do link
page_text = pagina.extract_text() parsed_url = urlparse(self.link)
pages_data.append( file_name = (
{ os.path.basename(parsed_url.path) or f"diario_{self.numero}.pdf"
"number": i + 1,
"content": page_text,
}
) )
self.page_content = json.dumps(pages_data)
# Salva o arquivo no campo `arquivo`
self.arquivo.save(file_name, ContentFile(response.content), save=False)
# Extrai o conteúdo do PDF
pdf = PyPDF2.PdfReader(self.arquivo)
pages_data = []
for i, pagina in enumerate(pdf.pages):
page_text = pagina.extract_text()
if page_text: # Ignora páginas sem conteúdo
pages_data.append(
{
"number": i + 1,
"content": page_text,
}
)
# Salva o conteúdo das páginas no campo `page_content`
self.page_content = pages_data
except requests.RequestException as e:
print(f"Erro ao baixar o PDF: {e}")
except PyPDF2.PdfReadError as e:
print(f"Erro ao ler o PDF: {e}")
except Exception as e:
print(f"Erro inesperado: {e}")
# Salva o modelo
super().save(*args, **kwargs) super().save(*args, **kwargs)
@property @property
def data_formatada(self): def data_formatada(self):
return format_date(self.data, format="long", locale="pt_BR") return format_date(self.data, format="long", locale="pt_BR")
@ -89,4 +122,3 @@ class DiarioOficial(models.Model):
class Meta: class Meta:
constraints = [models.UniqueConstraint(fields=["numero"], name="unique_numero")] constraints = [models.UniqueConstraint(fields=["numero"], name="unique_numero")]
verbose_name_plural = "Diários Oficiais" verbose_name_plural = "Diários Oficiais"

67
diarios/search_service.py Normal file
View File

@ -0,0 +1,67 @@
from elasticsearch_dsl import Q, Search
from .documents import DiarioOficialDocument
class DiarioOficialSearchService:
    """Full-text search over the index backing ``DiarioOficialDocument``.

    Field names used here follow the document mapping: ``numero``, ``data``,
    ``link``, ``tipo.nome``, the concatenated ``content`` field and the
    nested ``pages`` field.
    """

    @staticmethod
    def search(query, highlight=True, fuzziness=1, page=1, page_size=10, tipos=None, data_inicio=None, data_fim=None):
        """Run a paginated search and return plain dictionaries.

        Args:
            query: Free-text search terms.
            highlight: When true, request highlighted fragments of ``content``.
            fuzziness: Edit distance passed to the ``multi_match`` query.
            page: 1-based page number; values below 1 are treated as 1.
            page_size: Hits per page; values below 1 are treated as 1.
            tipos: Optional iterable of type names to restrict the search to
                (presumably ``TipoDiarioOficial.nome`` values - confirm with
                the caller).
            data_inicio: Optional lower bound (inclusive) for ``data``.
            data_fim: Optional upper bound (inclusive) for ``data``.

        Returns:
            A dict with ``total`` (number of matching documents) and
            ``results`` (list of dicts with ``id``, ``numero``, ``data``,
            ``link``, ``tipo_nome``, ``score`` and, when available,
            ``highlights`` as a list of HTML fragments).
        """
        # Keep the bulky page text out of the returned _source; only the
        # metadata and the highlighted fragments are needed by callers.
        s = DiarioOficialDocument.search().source(excludes=['content', 'pages'])

        # Filters
        if tipos:
            # ``tipo.nome`` is an analyzed text field, so an exact ``terms``
            # lookup would not match; use one phrase match per requested name.
            s = s.filter(
                'bool',
                should=[Q('match_phrase', **{'tipo.nome': tipo}) for tipo in tipos],
                minimum_should_match=1,
            )

        # Accept an open-ended range: either bound may be given on its own.
        date_range = {}
        if data_inicio:
            date_range['gte'] = data_inicio
        if data_fim:
            date_range['lte'] = data_fim
        if date_range:
            s = s.filter('range', data=date_range)

        # Main query. No analyzer override is passed so that every field is
        # searched with the analyzer it was indexed with (``content`` uses
        # ``pt_analyzer``, which already applies the synonyms).
        main_query = Q(
            'multi_match',
            query=query,
            fields=[
                'numero^3',    # highest weight: the issue number
                'tipo.nome^2',  # medium weight: the type name
                'content',     # default weight: the page text
            ],
            fuzziness=fuzziness,
        )
        s = s.query(main_query)

        # Highlighting
        if highlight:
            s = s.highlight(
                'content',
                fragment_size=150,
                number_of_fragments=3,
                pre_tags=['<mark>'],
                post_tags=['</mark>']
            )

        # Pagination (clamped so the slice never gets a negative offset)
        page = max(page, 1)
        page_size = max(page_size, 1)
        start = (page - 1) * page_size
        s = s[start:start + page_size]

        response = s.execute()

        # Flatten the hits into plain dictionaries.
        results = []
        for hit in response:
            tipo = getattr(hit, 'tipo', None)
            result = {
                'id': hit.id,
                'numero': getattr(hit, 'numero', None),
                'data': getattr(hit, 'data', None),
                'link': getattr(hit, 'link', None),
                # ``tipo`` may be missing or empty when the record has no type.
                'tipo_nome': getattr(tipo, 'nome', None),
                'score': hit.meta.score
            }

            if highlight:
                fragments = getattr(getattr(hit.meta, 'highlight', None), 'content', None)
                if fragments:
                    # Fragments come back as a list-like object of strings.
                    result['highlights'] = list(fragments)

            results.append(result)

        return {
            'total': response.hits.total.value,
            'results': results
        }

View File

@ -1,17 +1,15 @@
from django.db.models.signals import post_save from django.db.models.signals import post_save, post_delete
from django.dispatch import receiver from django.dispatch import receiver
import PyPDF2
from io import BytesIO
from .models import PDFDocument
@receiver(post_save, sender=PDFDocument) @receiver(post_save, sender=DiarioOficial)
def extract_text(sender, instance, created, **kwargs): def update_document(sender, instance, **kwargs):
print("Signal disparado!") # Teste se o Signal está funcionando """Atualizar documento no Elasticsearch quando o objeto for salvo"""
if created and instance.file: DiarioOficialDocument.update_document(instance)
pdf = PyPDF2.PdfReader(instance.file)
text = []
for page in pdf.pages: @receiver(post_delete, sender=DiarioOficial)
text.append(page.extract_text()) def delete_document(sender, instance, **kwargs):
instance.content = "\n".join(text) """Deletar documento do Elasticsearch quando o objeto for deletado"""
instance.save(update_fields=["content"]) document = DiarioOficialDocument.get(id=instance.id)
document.delete()

View File

@ -0,0 +1,124 @@
{% extends "base.html" %}
{% block content %}
<div class="container">
<h1>Busca de Diários Oficiais</h1>
<form method="GET" action="{% url 'search_diarios' %}" class="mb-4">
<div class="input-group">
<input type="text" name="q" class="form-control" value="{{ query }}" placeholder="Digite sua busca...">
<button type="submit" class="btn btn-primary">Buscar</button>
</div>
</form>
{% if query %}
<div class="mb-3">
<h2>Resultados para "{{ query }}"</h2>
<p>Encontrados {{ total }} resultados</p>
</div>
{% if results %}
<div class="search-results">
{% for result in results %}
<div class="card mb-3">
<div class="card-header">
<h5>{{ result.tipo }} nº {{ result.numero }}</h5>
<p class="text-muted">Data: {{ result.data }}</p>
</div>
<div class="card-body">
{% if result.highlight %}
<div class="highlight-section mb-3">
<h6>Destaques:</h6>
<div class="highlight-content">{{ result.highlight|safe }}</div>
</div>
{% endif %}
{% if result.highlighted_pages %}
<div class="highlighted-pages">
<h6>Páginas com o termo buscado:</h6>
<div class="accordion" id="pagesAccordion{{ result.id }}">
{% for page in result.highlighted_pages %}
<div class="accordion-item">
<h2 class="accordion-header">
<button class="accordion-button collapsed" type="button" data-bs-toggle="collapse"
data-bs-target="#page{{ result.id }}_{{ page.number }}">
Página {{ page.number }}
</button>
</h2>
<div id="page{{ result.id }}_{{ page.number }}" class="accordion-collapse collapse"
data-bs-parent="#pagesAccordion{{ result.id }}">
<div class="accordion-body">
{{ page.content|safe }}
</div>
</div>
</div>
{% endfor %}
</div>
</div>
{% endif %}
<div class="mt-3">
<a href="{{ result.link }}" target="_blank" class="btn btn-sm btn-outline-primary">
Ver Diário Online
</a>
<a href="{% url 'diario_detail' result.id %}" class="btn btn-sm btn-outline-secondary">
Ver Detalhes
</a>
</div>
</div>
</div>
{% endfor %}
</div>
<!-- Paginação -->
{% if total_pages > 1 %}
<nav aria-label="Paginação">
<ul class="pagination">
{% if page > 1 %}
<li class="page-item">
<a class="page-link" href="?q={{ query }}&page={{ page|add:'-1' }}&size={{ size }}">Anterior</a>
</li>
{% endif %}
{% for i in total_pages|ljust:"5" %}
{% if i > 0 and i <= total_pages %}
<li class="page-item {% if i == page %}active{% endif %}">
<a class="page-link" href="?q={{ query }}&page={{ i }}&size={{ size }}">{{ i }}</a>
</li>
{% endif %}
{% endfor %}
{% if page < total_pages %}
<li class="page-item">
<a class="page-link" href="?q={{ query }}&page={{ page|add:'1' }}&size={{ size }}">Próxima</a>
</li>
{% endif %}
</ul>
</nav>
{% endif %}
{% else %}
<div class="alert alert-info">
Nenhum resultado encontrado para a sua busca.
</div>
{% endif %}
{% endif %}
</div>
<style>
.highlight-content em {
background-color: #ffeeba;
font-style: normal;
padding: 2px;
border-radius: 2px;
}
.accordion-body em {
background-color: #ffeeba;
font-style: normal;
padding: 2px;
border-radius: 2px;
}
</style>
{% endblock %}

View File

@ -0,0 +1,173 @@
{% extends "base.html" %}
{% load static %}
{% block title %}Busca de Diários Oficiais{% endblock %}
{% block content %}
<div class="container mt-4">
<h1 class="mb-4">Busca de Diários Oficiais</h1>
<div class="card mb-4">
<div class="card-body">
<form method="get" action="{% url 'diario-search' %}">
<div class="row g-3">
<div class="col-md-9">
<label for="q" class="form-label">Buscar por:</label>
<input type="text" id="q" name="q" value="{{ query }}"
class="form-control"
placeholder="Digite palavras-chave, frases ou utilize operadores AND, OR, NOT">
</div>
<div class="col-md-3 d-flex align-items-end">
<button type="submit" class="btn btn-primary w-100">
<i class="bi bi-search"></i> Buscar
</button>
</div>
</div>
<div class="mt-3">
<a class="btn btn-link p-0" data-bs-toggle="collapse" href="#advancedOptions" role="button">
Opções avançadas
</a>
</div>
<div class="collapse" id="advancedOptions">
<div class="row g-3 mt-2">
<div class="col-md-4">
<label class="form-label">Tipos de Diário:</label>
<div class="border rounded p-2" style="max-height: 200px; overflow-y: auto;">
{% for tipo in tipos_disponiveis %}
<div class="form-check">
<input class="form-check-input" type="checkbox"
id="tipo_{{ tipo.id }}" name="tipos" value="{{ tipo.id }}"
{% if tipos_selecionados and tipo.id|stringformat:"i" in tipos_selecionados %}checked{% endif %}>
<label class="form-check-label" for="tipo_{{ tipo.id }}">
{{ tipo.nome }}
</label>
</div>
{% endfor %}
</div>
</div>
<div class="col-md-4">
<label for="data_inicio" class="form-label">Data Inicial:</label>
<input type="date" id="data_inicio" name="data_inicio"
value="{{ data_inicio }}" class="form-control">
</div>
<div class="col-md-4">
<label for="data_fim" class="form-label">Data Final:</label>
<input type="date" id="data_fim" name="data_fim"
value="{{ data_fim }}" class="form-control">
</div>
<div class="col-md-6">
<label for="fuzziness" class="form-label">Tolerância a erros:</label>
<select id="fuzziness" name="fuzziness" class="form-select">
<option value="0" {% if fuzziness == 0 %}selected{% endif %}>Sem tolerância</option>
<option value="1" {% if fuzziness == 1 %}selected{% endif %}>Baixa tolerância</option>
<option value="2" {% if fuzziness == 2 %}selected{% endif %}>Alta tolerância</option>
</select>
</div>
<div class="col-md-6 d-flex align-items-end">
<div class="form-check">
<input class="form-check-input" type="checkbox" id="highlight"
name="highlight" value="true" {% if highlight %}checked{% endif %}>
<label class="form-check-label" for="highlight">
Destacar termos encontrados
</label>
</div>
</div>
</div>
</div>
</form>
</div>
</div>
{% if query %}
<div class="mt-4">
<div class="d-flex justify-content-between align-items-center mb-4">
<h2>Resultados da busca</h2>
<span class="badge bg-primary">{{ total }} resultado(s)</span>
</div>
{% if results %}
{% for result in results %}
<div class="card mb-3">
<div class="card-body">
<h5 class="card-title">
<a href="{% url 'diario-detail' result.id %}?q={{ query|urlencode }}"
class="text-decoration-none">
{{ result.tipo_nome }} nº {{ result.numero }} - {{ result.data|date:"d/m/Y" }}
</a>
</h5>
{% if result.highlights %}
<div class="card-text mt-2">
{% for highlight in result.highlights %}
<p class="mb-1">...{{ highlight|safe }}...</p>
{% endfor %}
</div>
{% endif %}
<div class="mt-3 text-muted small">
<span class="me-3">
<i class="bi bi-star-fill text-warning"></i> Relevância: {{ result.score|floatformat:2 }}
</span>
{% if result.link %}
<a href="{{ result.link }}" target="_blank" class="text-decoration-none">
<i class="bi bi-box-arrow-up-right"></i> Ver original
</a>
{% endif %}
</div>
</div>
</div>
{% endfor %}
{% if pages > 1 %}
<nav aria-label="Page navigation">
<ul class="pagination justify-content-center">
{% if page > 1 %}
<li class="page-item">
<a class="page-link"
href="?q={{ query }}&page={{ page|add:'-1' }}&highlight={{ highlight|lower }}&fuzziness={{ fuzziness }}{% for tipo in tipos_selecionados %}&tipos={{ tipo }}{% endfor %}{% if data_inicio %}&data_inicio={{ data_inicio }}{% endif %}{% if data_fim %}&data_fim={{ data_fim }}{% endif %}">
Anterior
</a>
</li>
{% endif %}
{% for i in page_range %}
<li class="page-item {% if i == page %}active{% endif %}">
<a class="page-link"
href="?q={{ query }}&page={{ i }}&highlight={{ highlight|lower }}&fuzziness={{ fuzziness }}{% for tipo in tipos_selecionados %}&tipos={{ tipo }}{% endfor %}{% if data_inicio %}&data_inicio={{ data_inicio }}{% endif %}{% if data_fim %}&data_fim={{ data_fim }}{% endif %}">
{{ i }}
</a>
</li>
{% endfor %}
{% if page < pages %}
<li class="page-item">
<a class="page-link"
href="?q={{ query }}&page={{ page|add:'1' }}&highlight={{ highlight|lower }}&fuzziness={{ fuzziness }}{% for tipo in tipos_selecionados %}&tipos={{ tipo }}{% endfor %}{% if data_inicio %}&data_inicio={{ data_inicio }}{% endif %}{% if data_fim %}&data_fim={{ data_fim }}{% endif %}">
Próxima
</a>
</li>
{% endif %}
</ul>
</nav>
{% endif %}
{% else %}
<div class="alert alert-warning text-center">
<h4 class="alert-heading">Nenhum resultado encontrado</h4>
<p>Não encontramos resultados para "{{ query }}". Tente ajustar seus termos de busca.</p>
</div>
{% endif %}
</div>
{% else %}
<div class="text-center py-5 bg-light rounded">
<p class="lead text-muted">Digite um termo de busca para encontrar diários oficiais</p>
</div>
{% endif %}
</div>
{% endblock %}

View File

@ -1,330 +0,0 @@
<!DOCTYPE html>
<html lang="pt-BR">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{% if query %}{{ query }} - {% endif %}Pesquisa de Documentos</title>
<!-- Bootstrap 5 CSS -->
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
<!-- Fonte personalizada -->
<link href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" rel="stylesheet">
<!-- Ícones do Bootstrap -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.8.1/font/bootstrap-icons.css">
<style>
body {
font-family: 'Roboto', Arial, sans-serif;
background-color: #f8f9fa;
color: #202124;
}
.search-container {
max-width: 650px;
margin: 0 auto;
}
.main-container {
max-width: 650px;
margin: 0 auto;
}
.search-box {
border-radius: 24px;
border: 1px solid #dfe1e5;
box-shadow: none;
height: 44px;
padding-left: 20px;
}
.search-box:focus {
box-shadow: 0 1px 6px rgba(32,33,36,.28);
border-color: rgba(223,225,229,0);
}
.search-button {
border-radius: 24px;
margin-left: 10px;
}
.result-item {
padding: 20px 0;
border-bottom: 1px solid #e0e0e0;
}
.result-item:last-child {
border-bottom: none;
}
.pdf-link {
color: #e74c3c;
margin-left: 10px;
font-size: 0.8em;
text-decoration: none;
}
.result-title {
display: flex;
align-items: center;
justify-content: space-between;
color: #1a0dab;
font-weight: 500;
margin-bottom: 5px;
font-size: 18px;
}
.result-title a {
text-decoration: none;
}
.result-title a:hover {
text-decoration: underline;
}
.result-content {
color: #4d5156;
font-size: 14px;
margin-bottom: 5px;
line-height: 1.58;
}
.result-meta {
color: #70757a;
font-size: 12px;
}
.suggestion {
color: #1a0dab;
text-decoration: none;
}
.suggestion:hover {
text-decoration: underline;
}
mark {
background-color: #ffffc2;
padding: 0;
font-weight: bold;
}
.related-term {
color: #70757a;
font-size: 14px;
margin-right: 10px;
}
.pagination-link {
color: #1a0dab;
padding: 0 10px;
text-decoration: none;
}
.pagination-link.active {
color: #202124;
font-weight: bold;
}
.pagination-link:hover {
text-decoration: underline;
}
.search-stats {
color: #70757a;
font-size: 14px;
margin-bottom: 15px;
}
.header {
padding: 20px 0;
background-color: white;
border-bottom: 1px solid #dfe1e5;
}
.badge-exact-match {
background-color: #e8f0fe;
color: #1a73e8;
border: 1px solid #d2e3fc;
font-weight: normal;
}
.search-tip {
font-size: 13px;
color: #70757a;
margin-top: 5px;
}
</style>
<div class="position-relative">
<input type="text" name="q" class="form-control search-box"
id="searchInput" autocomplete="off">
<div id="suggestionsBox" class="position-absolute w-100 bg-white shadow"></div>
</div>
<script>
// JavaScript para sugestões em tempo real
document.getElementById('searchInput').addEventListener('input', function(e) {
const query = e.target.value;
if(query.length > 2) {
fetch(`/diarios/spellcheck/?q=${encodeURIComponent(query)}`)
.then(response => response.json())
.then(data => {
const suggestionsBox = document.getElementById('suggestionsBox');
suggestionsBox.innerHTML = data.suggestions.map(sug =>
`<div class="suggestion-item p-2 border-bottom cursor-pointer">
${sug}
</div>`
).join('');
});
}
});
// Clique na sugestão
document.getElementById('suggestionsBox').addEventListener('click', function(e) {
if(e.target.classList.contains('suggestion-item')) {
document.getElementById('searchInput').value = e.target.textContent;
this.innerHTML = '';
}
});
</script>
</head>
<body>
<!-- Cabeçalho com barra de pesquisa -->
<header class="header">
<div class="container">
<div class="row align-items-center">
<div class="col-auto">
<a href="/" class="text-decoration-none">
<h3 class="mb-0 text-primary"><i class="bi bi-search"></i> BuscaDocs</h3>
</a>
</div>
<div class="col">
<form action="{% url 'search_view' %}" method="get" class="d-flex">
<input type="text" name="q" class="form-control search-box" value="{{ query }}" placeholder="Pesquisar documentos..." aria-label="Pesquisar">
<button class="btn btn-primary search-button" type="submit"><i class="bi bi-search"></i></button>
</form>
<div class="search-tip">
Use aspas duplas para buscar frases exatas, ex: "documento oficial"
</div>
</div>
</div>
</div>
</header>
<div class="container py-4 main-container">
{% if query %}
<!-- Estatísticas da busca -->
<div class="search-stats">
{% if total_hits > 0 %}
<p>Cerca de {{ total_hits }} resultados encontrados para "{{ query }}"</p>
{% else %}
<p>Nenhum resultado encontrado para "{{ query }}"</p>
{% endif %}
</div>
<!-- Correção ortográfica -->
{% if spelling_correction %}
<div class="mb-4">
<p>Você quis dizer: <a href="?q={{ spelling_correction|urlencode }}" class="suggestion">{{ spelling_correction }}</a>?</p>
</div>
{% endif %}
<!-- Sugestões de termos -->
{% if suggestions %}
<div class="mb-4">
<p>Talvez você esteja procurando por:
{% for suggestion in suggestions %}
<a href="?q={{ suggestion|urlencode }}" class="suggestion me-2">{{ suggestion }}</a>{% if not forloop.last %}, {% endif %}
{% endfor %}
</p>
</div>
{% endif %}
<!-- Resultados da busca -->
{% if results %}
<div class="results-container">
{% for result in results %}
<div class="result-item">
<div class="d-flex gap-2 mb-2">
{% if result.is_exact_match %}
<span class="badge badge-exact-match">Correspondência exata</span>
{% endif %}
{% if result.is_related %}
<span class="badge bg-secondary">Termo relacionado</span>
{% endif %}
</div>
<h5 class="result-title">
<a href="{{ result.pdf_url }}" target="_blank">{{ result.highlighted_title|safe }}</a>
<a href="{{ result.pdf_url }}" target="_blank" class="pdf-link" title="Abrir PDF completo">
<i class="bi bi-file-pdf"></i>
</a>
</h5>
<div class="result-content">{{ result.highlighted_content|safe }}</div>
<div class="result-meta">
<i class="bi bi-calendar-date"></i> {{ result.uploaded_at|date:"d/m/Y" }}
{% if result.matching_pages %}
<span class="ms-3">
<i class="bi bi-file-earmark-text"></i> Páginas encontradas:
{% for page in result.matching_pages %}
<a href="{{ result.pdf_url }}#page={{ page }}" target="_blank" class="badge bg-light text-dark">{{ page }}</a>
{% endfor %}
</span>
{% endif %}
</div>
</div>
{% endfor %}
</div>
<!-- Paginação -->
{% if total_pages > 1 %}
<nav aria-label="Paginação de resultados" class="my-4">
<div class="d-flex justify-content-center">
{% if page > 1 %}
<a href="?q={{ query|urlencode }}&page={{ page|add:'-1' }}" class="pagination-link">
<i class="bi bi-chevron-left"></i> Anterior
</a>
{% endif %}
{% for p in page_range %}
<a href="?q={{ query|urlencode }}&page={{ p }}" class="pagination-link {% if p == page %}active{% endif %}">
{{ p }}
</a>
{% endfor %}
{% if page < total_pages %}
<a href="?q={{ query|urlencode }}&page={{ page|add:'1' }}" class="pagination-link">
Próxima <i class="bi bi-chevron-right"></i>
</a>
{% endif %}
</div>
</nav>
{% endif %}
{% else %}
<div class="alert alert-info" role="alert">
<i class="bi bi-info-circle-fill me-2"></i>
Nenhum documento corresponde aos termos de pesquisa. Tente usar palavras-chave diferentes ou mais gerais.
{% if has_exact_phrases %}
<p class="mt-2 mb-0">Você pesquisou por frases exatas. Tente remover as aspas para uma busca mais ampla.</p>
{% endif %}
</div>
{% endif %}
{% else %}
<!-- Página inicial de pesquisa -->
<div class="text-center py-5">
<h1 class="display-4 mb-4 text-primary"><i class="bi bi-search"></i> BuscaDocs</h1>
<div class="search-container mb-4">
<form action="{% url 'search_view' %}" method="get">
<div class="input-group mb-3">
<input type="text" name="q" class="form-control search-box py-3" placeholder="Pesquisar documentos..." aria-label="Pesquisar">
<button class="btn btn-primary search-button px-4" type="submit">
<i class="bi bi-search"></i> Pesquisar
</button>
</div>
</form>
<p class="text-muted">Pesquise em nossa biblioteca de documentos digitalizados</p>
<div class="mt-3 text-start p-3 border rounded bg-light">
<h5>Dicas de pesquisa:</h5>
<ul class="mb-0">
<li>Use <strong>aspas duplas</strong> para buscar frases exatas: <code>"documento oficial"</code></li>
<li>Tente usar sinônimos se não encontrar resultados</li>
<li>Seja específico para encontrar documentos relevantes</li>
</ul>
</div>
</div>
</div>
{% endif %}
</div>
<!-- Footer -->
<footer class="bg-light py-3 mt-5">
<div class="container text-center">
<p class="text-muted mb-0">© 2025 BuscaDocs - Sistema de Pesquisa de Documentos</p>
</div>
</footer>
<!-- Bootstrap JS Bundle with Popper -->
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"></script>
<!-- Script para sugestões em tempo real (opcional) -->
<script>
document.addEventListener('DOMContentLoaded', function() {
const searchInput = document.querySelector('input[name="q"]');
searchInput.focus();
});
</script>
</body>
</html>

View File

@ -1,7 +1,8 @@
from django.urls import path from django.urls import path
from .views import search_view, spellcheck_view from . import views
urlpatterns = [ urlpatterns = [
path("pesquisa/", search_view, name="search_view"), path('diario/<int:pk>/', views.diario_detail, name='diario_detail'),
path("spellcheck/", spellcheck_view, name="spellcheck_view"), path('diarios/search/', views.search_diarios, name='search_diarios'),
] ]

View File

@ -1,159 +1,83 @@
import json
import debugpy
from django.shortcuts import render from django.shortcuts import render
from elasticsearch_dsl import Search, Q from elasticsearch_dsl import Q
from elasticsearch_dsl.connections import connections from .documents import DiarioOficialDocument
from django.conf import settings
import re
from .documents import PDFDocument
from django.http import JsonResponse
def search_diarios(request):
# Configuração da conexão com o Elasticsearch q = request.GET.get('q', '')
connections.create_connection(hosts=[settings.ELASTICSEARCH_HOSTS])
def spellcheck_view(request):
query = request.GET.get("q", "")
suggestions = []
if query:
s = Search(index="pdf_documents")
s = s.suggest(
"auto_correct",
query,
phrase={
"field": "suggest",
"size": 3,
"gram_size": 3,
"confidence": 2.0,
"direct_generator": [{"field": "suggest", "suggest_mode": "popular"}],
},
)
response = s.execute()
if hasattr(response.suggest, "auto_correct"):
for option in response.suggest.auto_correct[0].options:
suggestions.append(option.text)
return JsonResponse({"suggestions": suggestions})
def search_view(request):
query = request.GET.get('q', '') # Obtém o termo de pesquisa da URL
page = int(request.GET.get('page', 1)) page = int(request.GET.get('page', 1))
size = int(request.GET.get('size', 10))
start = (page - 1) * size
end = start + size
results = [] results = []
suggestions = [] total = 0
spelling_correction = None
total_hits = 0 if q:
per_page = 10 # Busca principal com boost para relevância
query = Q(
if query: 'multi_match',
# Processamento especial para termos entre aspas query=q,
exact_phrases = re.findall(r'"([^"]*)"', query) fields=['content^3', 'tipo.nome^2', 'numero', 'pages.content'],
fuzziness='AUTO'
# Remove os termos entre aspas da consulta principal )
cleaned_query = query
for phrase in exact_phrases: # Pesquisa com highlighting
cleaned_query = cleaned_query.replace(f'"{phrase}"', '') search = DiarioOficialDocument.search()
search = search.query(query)
# Remove espaços extras e pontuação desnecessária search = search.highlight('content', fragment_size=150, number_of_fragments=3)
cleaned_query = re.sub(r'\s+', ' ', cleaned_query).strip() search = search.highlight('pages.content', fragment_size=150, number_of_fragments=3)
# Cria uma consulta no Elasticsearch # Paginação
search = Search(index='diarios_oficiais') search = search[start:end]
# Lista para armazenar todas as consultas response = search.execute()
queries = []
total = response.hits.total.value
# Adiciona consulta para termos gerais (com fuzziness para tolerância a erros)
if cleaned_query: for hit in response:
queries.append( # Adicionar destaque
Q('multi_match', highlight = ""
query=cleaned_query, if hasattr(hit.meta, 'highlight'):
fields=['title^3', 'pages.content^2'], if 'content' in hit.meta.highlight:
fuzziness='AUTO', highlight = "...".join(hit.meta.highlight.content)
boost=2)
) # Processando páginas com destaque
highlighted_pages = []
# Adiciona consultas exatas para frases entre aspas (sem fuzziness) if hasattr(hit.meta, 'highlight') and 'pages.content' in hit.meta.highlight:
for phrase in exact_phrases: for i, content in enumerate(hit.meta.highlight['pages.content']):
if phrase.strip(): # Encontre a página correspondente
queries.append( page_number = i + 1 # Lógica simplificada, pode precisar de ajuste
Q('match_phrase', highlighted_pages.append({
pages__content={ 'number': page_number,
'query': phrase, 'content': content
'boost': 2, })
'slop': 0 # Sem flexibilidade na ordem das palavras
}) # Combine dados do documento com os destaques
) result = {
'id': hit.id,
# Combina as consultas com OR (se houver alguma) 'tipo': hit.tipo.nome if hasattr(hit, 'tipo') and hit.tipo else '',
if queries: 'numero': hit.numero,
search = search.query( 'data': hit.data,
Q('bool', should=queries, minimum_should_match=1) 'link': hit.link,
) 'highlight': highlight,
'highlighted_pages': highlighted_pages
# Configuração do highlight para mostrar mais contexto }
search = search.highlight(
'pages.content', results.append(result)
fragment_size=300,
number_of_fragments=2, context = {
pre_tags=['<mark>'], 'query': q,
post_tags=['</mark>']
)
# Paginação
search = search[(page-1)*per_page:page*per_page]
# Executa a consulta
response = search.execute()
total_hits = response.hits.total.value
# Processa os resultados
for hit in response:
# Extrai o conteúdo destacado ou usa o original
if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'pages.content'):
highlighted_content = ' ... '.join(hit.meta.highlight['pages.content'])
else:
highlighted_content = ""
# Extrai o título destacado ou usa o original
if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'title'):
highlighted_title = hit.meta.highlight.title[0]
else:
highlighted_title = hit.title
# Verifica se o resultado corresponde a uma frase exata
is_exact_match = any(phrase.lower() in hit.pages.content.lower() or
phrase.lower() in hit.title.lower()
for phrase in exact_phrases)
results.append({
'id': hit.meta.id,
'title': hit.title,
'highlighted_title': highlighted_title,
'highlighted_content': highlighted_content,
'data': hit.data,
'numero': hit.numero,
'link': hit.link,
'finalizado': hit.finalizado,
'is_exact_match': is_exact_match
})
# Calcula a paginação
total_pages = (total_hits + per_page - 1) // per_page if total_hits > 0 else 0
# Renderiza o template com os resultados
return render(request, 'diarios/search_results.html', {
'query': query,
'results': results, 'results': results,
'suggestions': suggestions[:5], # Limita a 5 sugestões 'total': total,
'spelling_correction': spelling_correction,
'total_hits': total_hits,
'page': page, 'page': page,
'total_pages': total_pages, 'size': size,
'page_range': range(max(1, page-2), min(total_pages+1, page+3)), 'total_pages': (total + size - 1) // size if total > 0 else 0,
'has_exact_phrases': bool(exact_phrases) }
})
return render(request, 'diarios/diarios_search.html', context)
def diario_detail(request, pk):
    """Render the detail page for a single diário.

    Looks up the record whose primary key is ``pk`` and renders the
    ``diarios/diario_detail.html`` template with it under the ``diario``
    context key.

    Args:
        request: The incoming HTTP request.
        pk: Primary key of the record to display.

    Returns:
        The rendered template response.

    Raises:
        Http404: Raised by ``get_object_or_404`` when no record matches ``pk``.
    """
    # Imported locally because this module's top-level imports do not provide
    # either name; without them the view fails with NameError on first call.
    from django.shortcuts import get_object_or_404

    # NOTE(review): the original referenced an undefined name ``Diario``.
    # The app's admin registers ``DiarioOficial``, so that is presumably the
    # intended model -- confirm against models.py.
    from .models import DiarioOficial

    diario = get_object_or_404(DiarioOficial, pk=pk)
    return render(request, 'diarios/diario_detail.html', {'diario': diario})

View File

@ -16,7 +16,9 @@ django-crispy-forms==2.3 # https://github.com/django-crispy-forms/django-crispy
crispy-bootstrap5==2024.10 # https://github.com/django-crispy-forms/crispy-bootstrap5 crispy-bootstrap5==2024.10 # https://github.com/django-crispy-forms/crispy-bootstrap5
django-compressor==4.5.1 # https://github.com/django-compressor/django-compressor django-compressor==4.5.1 # https://github.com/django-compressor/django-compressor
django-redis==5.4.0 # https://github.com/jazzband/django-redis django-redis==5.4.0 # https://github.com/jazzband/django-redis
djangorestframework
elasticsearch elasticsearch
django-elasticsearch-dsl django-elasticsearch-dsl
PyPDF2 PyPDF2
babel

4
sinonimos.txt Normal file
View File

@ -0,0 +1,4 @@
lei, legislação, norma
processo, procedimento, autos
contrato, acordo, convênio