arruma o processo de busca textual nos diários

This commit is contained in:
root
2025-03-14 17:36:14 +01:00
parent 8d1f6feeaf
commit f2e5cd73b7
15 changed files with 650 additions and 645 deletions

View File

@ -84,6 +84,7 @@ LOCAL_APPS = [
"diários_oficiais_alems.users",
"diarios",
"django_elasticsearch_dsl",
"rest_framework",
]
# https://docs.djangoproject.com/en/dev/ref/settings/#installed-apps
INSTALLED_APPS = DJANGO_APPS + THIRD_PARTY_APPS + LOCAL_APPS
@ -293,3 +294,36 @@ ELASTICSEARCH_DSL = {
"default": {"hosts": "http://elasticsearch:9200"}, # same as above
}
ELASTICSEARCH_HOSTS = "http://elasticsearch:9200"
# Index-level Elasticsearch settings: one shard, no replicas, and a custom
# Portuguese analyzer ("pt_analyzer") built from the token filters below.
ELASTICSEARCH_INDEX_SETTINGS = {
    'number_of_shards': 1,
    'number_of_replicas': 0,
    'analysis': {
        'filter': {
            # Stop-word filter using the `_portuguese_` stop-word set.
            'portuguese_stop': {
                'type': 'stop',
                'stopwords': '_portuguese_'
            },
            # Stemmer configured for Portuguese.
            'portuguese_stemmer': {
                'type': 'stemmer',
                'language': 'portuguese'
            },
            # Synonyms are read from a file rather than listed inline.
            # NOTE(review): the path is presumably resolved by the
            # Elasticsearch node relative to its config directory - confirm
            # the file is deployed there.
            'synonym_filter': {
                'type': 'synonym',
                'synonyms_path': 'analysis/sinonimos.txt',
            }
        },
        'analyzer': {
            # Filters run in the listed order: lowercase, stop words,
            # stemming, then synonyms.
            'pt_analyzer': {
                'tokenizer': 'standard',
                'filter': [
                    'lowercase',
                    'portuguese_stop',
                    'portuguese_stemmer',
                    'synonym_filter'
                ]
            }
        }
    }
}

View File

@ -1,8 +1,18 @@
from django.contrib import admin
from .models import PDFDocument
from .models import PDFDocument, DiarioOficial, TipoDiarioOficial
from django.db import models
@admin.register(PDFDocument)
class PDFDocumentAdmin(admin.ModelAdmin):
    """Register PDFDocument in the Django admin with default ModelAdmin options."""

    pass
@admin.register(DiarioOficial)
class DiarioOficialAdmin(admin.ModelAdmin):
    """Register DiarioOficial in the Django admin with default ModelAdmin options."""

    pass
@admin.register(TipoDiarioOficial)
class TipoDiarioOficialAdmin(admin.ModelAdmin):
    """Register TipoDiarioOficial in the Django admin with default ModelAdmin options."""

    pass

View File

@ -0,0 +1,8 @@
from django import template
register = template.Library()
@register.filter
def get_range(value):
    """Return ``range(value)`` so a template can loop a given number of times.

    Template variables frequently arrive as strings (or ``None`` when a
    context key is missing), so the value is coerced to ``int`` first.
    Anything that cannot be converted yields an empty range instead of
    raising, so a bad value does not break template rendering.

    Args:
        value: Number of iterations; an int or anything ``int()`` accepts.

    Returns:
        A ``range`` object; empty when ``value`` is invalid or not positive.
    """
    try:
        count = int(value)
    except (TypeError, ValueError):
        return range(0)
    return range(count)

View File

@ -1,157 +1,91 @@
from django_elasticsearch_dsl import Document, fields
from django_elasticsearch_dsl.registries import registry
from .models import PDFDocument
@registry.register_document
class PDFDocumentDocument(Document):
title = fields.TextField()
content = fields.TextField(analyzer="portuguese")
pages = fields.NestedField(
properties={
"number": fields.IntegerField(),
"content": fields.TextField(analyzer="portuguese"),
}
)
class Index:
name = "pdf_documents"
settings = {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"analyzer": {
"portuguese": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"ascii_folding",
"portuguese_stemmer",
"stop",
"portuguese_synonyms",
],
},
"portuguese_search": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"ascii_folding",
"portuguese_stemmer",
"stop",
"suggest_shingle",
],
},
},
"filter": {
"suggest_shingle": {
"type": "shingle",
"min_shingle_size": 2,
"max_shingle_size": 3,
},
"stop": {"type": "stop", "stopwords": "_portuguese_"},
"ascii_folding": {"type": "asciifolding"},
"portuguese_stemmer": {"type": "stemmer", "language": "portuguese"},
"portuguese_synonyms": {
"type": "synonym",
"synonyms_path": "synonyms.txt",
"expand": True,
},
},
},
}
class Django:
model = PDFDocument
fields = ["uploaded_at", "file"]
from django_elasticsearch_dsl import Document, fields
from django_elasticsearch_dsl.registries import registry
from .models import DiarioOficial
@registry.register_document
class DiarioOficialDocument(Document):
# Campos principais
title = fields.TextField()
tipo = fields.KeywordField()
# Campo para arquivo PDF (se aplicável)
arquivo = fields.TextField(attr="arquivo.url")
# Nested field para páginas (usando o page_content)
pages = fields.NestedField(
properties={
"number": fields.IntegerField(),
"content": fields.TextField(analyzer="portuguese")
}
tipo = fields.ObjectField(properties={
'nome': fields.TextField()
})
numero = fields.TextField()
data = fields.DateField()
link = fields.TextField()
# Campo para armazenar todas as páginas para busca
content = fields.TextField(
analyzer='pt_analyzer',
)
# Campo para armazenar páginas individualmente
pages = fields.NestedField(properties={
'number': fields.IntegerField(),
'content': fields.TextField(
analyzer='pt_analyzer',
)
})
class Index:
name = "diarios_oficiais"
name = 'diarios_oficiais'
settings = {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"analyzer": {
"portuguese": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"ascii_folding",
"portuguese_stemmer",
"stop",
"portuguese_synonyms",
]
'number_of_shards': 1,
'number_of_replicas': 0,
'analysis': {
'filter': {
'portuguese_stop': {
'type': 'stop',
'stopwords': '_portuguese_'
},
"portuguese_search": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"ascii_folding",
"portuguese_stemmer",
"stop",
"suggest_shingle",
'portuguese_stemmer': {
'type': 'stemmer',
'language': 'portuguese'
},
'synonym_filter': {
'type': 'synonym',
'synonyms': [
'lei, legislação, norma',
'processo, procedimento, autos',
'contrato, acordo, convênio',
# Adicione mais sinônimos relevantes para o contexto legal
]
}
},
"filter": {
"suggest_shingle": {
"type": "shingle",
"min_shingle_size": 2,
"max_shingle_size": 3
},
"stop": {"type": "stop", "stopwords": "_portuguese_"},
"ascii_folding": {"type": "asciifolding"},
"portuguese_stemmer": {"type": "stemmer", "language": "portuguese"},
"portuguese_synonyms": {
"type": "synonym",
"synonyms_path": "synonyms.txt",
"expand": True
'analyzer': {
'pt_analyzer': {
'tokenizer': 'standard',
'filter': [
'lowercase',
'portuguese_stop',
'portuguese_stemmer',
'synonym_filter'
]
}
}
}
}
class Django:
model = DiarioOficial
fields = [
"data",
"numero",
"link",
'id'
]
def prepare_tipo(self, instance):
return instance.tipo.nome if instance.tipo else None
def prepare_title(self, instance):
return f"{instance.tipo.nome if instance.tipo else 'Diário'} {instance.numero}"
def prepare_pages(self, instance):
# Prepara o campo pages usando o page_content
if instance.tipo:
return {
'nome': instance.tipo.nome
}
return {}
def prepare_content(self, instance):
"""Concatena todo o conteúdo de todas as páginas em um único campo para busca"""
if instance.page_content:
return instance.page_content # page_content já é uma lista de dicionários
return " ".join([page.get('content', '') for page in instance.page_content])
return ""
def prepare_pages(self, instance):
"""Prepara o campo de páginas individuais para exibição e destaque"""
if instance.page_content:
return instance.page_content
return []

View File

@ -0,0 +1,24 @@
from django.core.management.base import BaseCommand
from django_elasticsearch_dsl.registries import registry
class Command(BaseCommand):
    """Management command that rebuilds every registered Elasticsearch index."""

    help = 'Reindexar todos os Diários Oficiais no Elasticsearch'

    def handle(self, *args, **options):
        """Drop, recreate and repopulate each index known to the registry.

        The registry only exposes lookup helpers (``get_indices`` and
        ``get_documents``); deleting and creating is done on each ``Index``
        object, and indexing is done through ``Document.update`` with the
        document's own queryset.
        """
        self.stdout.write('Iniciando reindexação...')

        indices = registry.get_indices()

        # Recreate the indices. Checking existence first avoids an error the
        # first time the command runs, when no index has been created yet.
        for index in indices:
            if index.exists():
                index.delete()
            index.create()

        # Re-index the documents of each index.
        for index in indices:
            self.stdout.write(f'Reindexando {index._name}...')
            for doc in registry.get_documents():
                # Compare index names: `index` is an Index object while
                # `doc._index._name` is a plain string.
                if doc._index._name != index._name:
                    continue
                self.stdout.write(f' + {doc.__name__}')
                document = doc()
                # update() needs the objects to index; pass the full queryset.
                document.update(document.get_queryset())

        self.stdout.write(self.style.SUCCESS('Reindexação concluída!'))

View File

@ -1,7 +1,12 @@
import requests
import os
from urllib.parse import urlparse
from django.core.files.base import ContentFile
from django.db import models
import PyPDF2
import json
from django.core.serializers.json import DjangoJSONEncoder
from babel.dates import format_date
class PDFDocument(models.Model):
@ -34,6 +39,7 @@ class PDFDocument(models.Model):
super().save(*args, **kwargs)
class TipoDiarioOficial(models.Model):
nome = models.CharField(max_length=100, unique=True)
@ -57,24 +63,51 @@ class DiarioOficial(models.Model):
numero = models.CharField(max_length=20, unique=True)
link = models.URLField(blank=True, null=True, unique=True)
page_content = models.JSONField(encoder=DjangoJSONEncoder, blank=True, null=True)
def save(self, *args, **kwargs):
if self.file:
pdf = PyPDF2.PdfReader(self.file)
pages_data = []
# Se houver um link, baixa o PDF e extrai o conteúdo
if self.link and not self.arquivo:
try:
# Faz o download do PDF
response = requests.get(self.link)
response.raise_for_status() # Verifica se o download foi bem-sucedido
for i, pagina in enumerate(pdf.pages):
page_text = pagina.extract_text()
pages_data.append(
{
"number": i + 1,
"content": page_text,
}
# Define o nome do arquivo a partir do link
parsed_url = urlparse(self.link)
file_name = (
os.path.basename(parsed_url.path) or f"diario_{self.numero}.pdf"
)
self.page_content = json.dumps(pages_data)
# Salva o arquivo no campo `arquivo`
self.arquivo.save(file_name, ContentFile(response.content), save=False)
# Extrai o conteúdo do PDF
pdf = PyPDF2.PdfReader(self.arquivo)
pages_data = []
for i, pagina in enumerate(pdf.pages):
page_text = pagina.extract_text()
if page_text: # Ignora páginas sem conteúdo
pages_data.append(
{
"number": i + 1,
"content": page_text,
}
)
# Salva o conteúdo das páginas no campo `page_content`
self.page_content = pages_data
except requests.RequestException as e:
print(f"Erro ao baixar o PDF: {e}")
except PyPDF2.PdfReadError as e:
print(f"Erro ao ler o PDF: {e}")
except Exception as e:
print(f"Erro inesperado: {e}")
# Salva o modelo
super().save(*args, **kwargs)
@property
def data_formatada(self):
    """Return ``self.data`` as a long-form date string in Brazilian Portuguese.

    Returns an empty string when no date is set. Without this guard Babel's
    ``format_date`` treats ``None`` as "today", so an undated record would be
    displayed with the current date.
    """
    if self.data is None:
        return ""
    return format_date(self.data, format="long", locale="pt_BR")
@ -89,4 +122,3 @@ class DiarioOficial(models.Model):
class Meta:
constraints = [models.UniqueConstraint(fields=["numero"], name="unique_numero")]
verbose_name_plural = "Diários Oficiais"

67
diarios/search_service.py Normal file
View File

@ -0,0 +1,67 @@
from elasticsearch_dsl import Q, Search
from .documents import DiarioOficialDocument
class DiarioOficialSearchService:
    """Full-text search over the index backing ``DiarioOficialDocument``."""

    @staticmethod
    def search(query, highlight=True, fuzziness=1, page=1, page_size=10, tipos=None, data_inicio=None, data_fim=None):
        """Run a paginated, optionally highlighted search.

        Field names follow the document mapping (``numero``, ``tipo.nome``,
        ``content``); the query analyzer is left to each field's own mapping
        instead of forcing a single analyzer name onto every field.

        Args:
            query: Free-text search terms.
            highlight: When true, return highlighted fragments of ``content``.
            fuzziness: Edit distance tolerated by the ``multi_match`` query.
            page: 1-based page number; values below 1 are treated as 1.
            page_size: Results per page; values below 1 are treated as 1.
            tipos: Optional iterable of tipo names to restrict the search to.
                NOTE(review): assumes callers pass names, not ids - confirm.
            data_inicio: Optional lower bound (inclusive) for ``data``.
            data_fim: Optional upper bound (inclusive) for ``data``.

        Returns:
            A dict with ``total`` (total hit count) and ``results`` (a list of
            dicts with ``id``, ``numero``, ``data``, ``link``, ``tipo_nome``,
            ``score`` and, when available, ``highlights``).
        """
        # Leave the bulky text fields out of the returned _source.
        s = DiarioOficialDocument.search().source(excludes=['content', 'pages'])

        # Filters
        if tipos:
            # `tipo.nome` is an analyzed text field, so use phrase matches
            # (OR-ed together) rather than an exact `terms` lookup.
            s = s.filter(
                'bool',
                should=[Q('match_phrase', **{'tipo.nome': nome}) for nome in tipos],
                minimum_should_match=1,
            )

        # Apply whichever date bounds were supplied; either may be omitted.
        date_range = {}
        if data_inicio:
            date_range['gte'] = data_inicio
        if data_fim:
            date_range['lte'] = data_fim
        if date_range:
            s = s.filter('range', data=date_range)

        # Main query with fuzziness
        main_query = Q(
            'multi_match',
            query=query,
            fields=[
                'numero^3',    # highest weight for the issue number
                'tipo.nome^2',  # medium weight for the type name
                'content',     # default weight for the full text
            ],
            fuzziness=fuzziness,
        )
        s = s.query(main_query)

        # Highlighting
        if highlight:
            s = s.highlight(
                'content',
                fragment_size=150,
                number_of_fragments=3,
                pre_tags=['<mark>'],
                post_tags=['</mark>']
            )

        # Pagination; clamp so a bad page number cannot produce a negative slice.
        page = max(int(page), 1)
        page_size = max(int(page_size), 1)
        start = (page - 1) * page_size
        s = s[start:start + page_size]

        # Execute the search
        response = s.execute()

        # Format the results
        results = []
        for hit in response:
            tipo = getattr(hit, 'tipo', None)
            result = {
                # Fall back to the Elasticsearch _id if `id` is not in _source.
                'id': getattr(hit, 'id', hit.meta.id),
                'numero': getattr(hit, 'numero', None),
                'data': getattr(hit, 'data', None),
                'link': getattr(hit, 'link', None),
                'tipo_nome': getattr(tipo, 'nome', None) if tipo else None,
                'score': hit.meta.score
            }

            # A hit that matched only on numero/tipo has no `content` fragments.
            if highlight and hasattr(hit.meta, 'highlight') and 'content' in hit.meta.highlight:
                result['highlights'] = list(hit.meta.highlight['content'])

            results.append(result)

        return {
            'total': response.hits.total.value,
            'results': results
        }

View File

@ -1,17 +1,15 @@
from django.db.models.signals import post_save
from django.db.models.signals import post_save, post_delete
from django.dispatch import receiver
import PyPDF2
from io import BytesIO
from .models import PDFDocument
@receiver(post_save, sender=PDFDocument)
def extract_text(sender, instance, created, **kwargs):
print("Signal disparado!") # Teste se o Signal está funcionando
if created and instance.file:
pdf = PyPDF2.PdfReader(instance.file)
text = []
for page in pdf.pages:
text.append(page.extract_text())
instance.content = "\n".join(text)
instance.save(update_fields=["content"])
@receiver(post_save, sender=DiarioOficial)
def update_document(sender, instance, **kwargs):
    """Index (or re-index) the saved DiarioOficial in Elasticsearch."""
    # Indexing goes through the instance method Document.update(); the
    # document class has no `update_document` classmethod.
    # NOTE(review): if django_elasticsearch_dsl autosync is enabled, the
    # library already re-indexes on save and this receiver duplicates it.
    DiarioOficialDocument().update(instance)
@receiver(post_delete, sender=DiarioOficial)
def delete_document(sender, instance, **kwargs):
    """Remove the deleted DiarioOficial from the Elasticsearch index."""
    # Issue a bulk "delete" action instead of fetching the document first:
    # a fetch raises when the object was never indexed, which would make the
    # database delete fail. raise_on_error=False tolerates a missing document.
    DiarioOficialDocument().update(instance, action='delete', raise_on_error=False)

View File

@ -0,0 +1,124 @@
{% extends "base.html" %}
{% block content %}
<div class="container">
<h1>Busca de Diários Oficiais</h1>
<form method="GET" action="{% url 'search_diarios' %}" class="mb-4">
<div class="input-group">
<input type="text" name="q" class="form-control" value="{{ query }}" placeholder="Digite sua busca...">
<button type="submit" class="btn btn-primary">Buscar</button>
</div>
</form>
{% if query %}
<div class="mb-3">
<h2>Resultados para "{{ query }}"</h2>
<p>Encontrados {{ total }} resultados</p>
</div>
{% if results %}
<div class="search-results">
{% for result in results %}
<div class="card mb-3">
<div class="card-header">
<h5>{{ result.tipo }} nº {{ result.numero }}</h5>
<p class="text-muted">Data: {{ result.data }}</p>
</div>
<div class="card-body">
{% if result.highlight %}
<div class="highlight-section mb-3">
<h6>Destaques:</h6>
<div class="highlight-content">{{ result.highlight|safe }}</div>
</div>
{% endif %}
{% if result.highlighted_pages %}
<div class="highlighted-pages">
<h6>Páginas com o termo buscado:</h6>
<div class="accordion" id="pagesAccordion{{ result.id }}">
{% for page in result.highlighted_pages %}
<div class="accordion-item">
<h2 class="accordion-header">
<button class="accordion-button collapsed" type="button" data-bs-toggle="collapse"
data-bs-target="#page{{ result.id }}_{{ page.number }}">
Página {{ page.number }}
</button>
</h2>
<div id="page{{ result.id }}_{{ page.number }}" class="accordion-collapse collapse"
data-bs-parent="#pagesAccordion{{ result.id }}">
<div class="accordion-body">
{{ page.content|safe }}
</div>
</div>
</div>
{% endfor %}
</div>
</div>
{% endif %}
<div class="mt-3">
<a href="{{ result.link }}" target="_blank" class="btn btn-sm btn-outline-primary">
Ver Diário Online
</a>
<a href="{% url 'diario_detail' result.id %}" class="btn btn-sm btn-outline-secondary">
Ver Detalhes
</a>
</div>
</div>
</div>
{% endfor %}
</div>
<!-- Paginação -->
{% if total_pages > 1 %}
<nav aria-label="Paginação">
<ul class="pagination">
{% if page > 1 %}
<li class="page-item">
<a class="page-link" href="?q={{ query }}&page={{ page|add:'-1' }}&size={{ size }}">Anterior</a>
</li>
{% endif %}
{% for i in total_pages|ljust:"5" %}
{% if i > 0 and i <= total_pages %}
<li class="page-item {% if i == page %}active{% endif %}">
<a class="page-link" href="?q={{ query }}&page={{ i }}&size={{ size }}">{{ i }}</a>
</li>
{% endif %}
{% endfor %}
{% if page < total_pages %}
<li class="page-item">
<a class="page-link" href="?q={{ query }}&page={{ page|add:'1' }}&size={{ size }}">Próxima</a>
</li>
{% endif %}
</ul>
</nav>
{% endif %}
{% else %}
<div class="alert alert-info">
Nenhum resultado encontrado para a sua busca.
</div>
{% endif %}
{% endif %}
</div>
<style>
.highlight-content em {
background-color: #ffeeba;
font-style: normal;
padding: 2px;
border-radius: 2px;
}
.accordion-body em {
background-color: #ffeeba;
font-style: normal;
padding: 2px;
border-radius: 2px;
}
</style>
{% endblock %}

View File

@ -0,0 +1,173 @@
{% extends "base.html" %}
{% load static %}
{% block title %}Busca de Diários Oficiais{% endblock %}
{% block content %}
<div class="container mt-4">
<h1 class="mb-4">Busca de Diários Oficiais</h1>
<div class="card mb-4">
<div class="card-body">
<form method="get" action="{% url 'diario-search' %}">
<div class="row g-3">
<div class="col-md-9">
<label for="q" class="form-label">Buscar por:</label>
<input type="text" id="q" name="q" value="{{ query }}"
class="form-control"
placeholder="Digite palavras-chave, frases ou utilize operadores AND, OR, NOT">
</div>
<div class="col-md-3 d-flex align-items-end">
<button type="submit" class="btn btn-primary w-100">
<i class="bi bi-search"></i> Buscar
</button>
</div>
</div>
<div class="mt-3">
<a class="btn btn-link p-0" data-bs-toggle="collapse" href="#advancedOptions" role="button">
Opções avançadas
</a>
</div>
<div class="collapse" id="advancedOptions">
<div class="row g-3 mt-2">
<div class="col-md-4">
<label class="form-label">Tipos de Diário:</label>
<div class="border rounded p-2" style="max-height: 200px; overflow-y: auto;">
{% for tipo in tipos_disponiveis %}
<div class="form-check">
<input class="form-check-input" type="checkbox"
id="tipo_{{ tipo.id }}" name="tipos" value="{{ tipo.id }}"
{% if tipos_selecionados and tipo.id|stringformat:"i" in tipos_selecionados %}checked{% endif %}>
<label class="form-check-label" for="tipo_{{ tipo.id }}">
{{ tipo.nome }}
</label>
</div>
{% endfor %}
</div>
</div>
<div class="col-md-4">
<label for="data_inicio" class="form-label">Data Inicial:</label>
<input type="date" id="data_inicio" name="data_inicio"
value="{{ data_inicio }}" class="form-control">
</div>
<div class="col-md-4">
<label for="data_fim" class="form-label">Data Final:</label>
<input type="date" id="data_fim" name="data_fim"
value="{{ data_fim }}" class="form-control">
</div>
<div class="col-md-6">
<label for="fuzziness" class="form-label">Tolerância a erros:</label>
<select id="fuzziness" name="fuzziness" class="form-select">
<option value="0" {% if fuzziness == 0 %}selected{% endif %}>Sem tolerância</option>
<option value="1" {% if fuzziness == 1 %}selected{% endif %}>Baixa tolerância</option>
<option value="2" {% if fuzziness == 2 %}selected{% endif %}>Alta tolerância</option>
</select>
</div>
<div class="col-md-6 d-flex align-items-end">
<div class="form-check">
<input class="form-check-input" type="checkbox" id="highlight"
name="highlight" value="true" {% if highlight %}checked{% endif %}>
<label class="form-check-label" for="highlight">
Destacar termos encontrados
</label>
</div>
</div>
</div>
</div>
</form>
</div>
</div>
{% if query %}
<div class="mt-4">
<div class="d-flex justify-content-between align-items-center mb-4">
<h2>Resultados da busca</h2>
<span class="badge bg-primary">{{ total }} resultado(s)</span>
</div>
{% if results %}
{% for result in results %}
<div class="card mb-3">
<div class="card-body">
<h5 class="card-title">
<a href="{% url 'diario-detail' result.id %}?q={{ query|urlencode }}"
class="text-decoration-none">
{{ result.tipo_nome }} nº {{ result.numero }} - {{ result.data|date:"d/m/Y" }}
</a>
</h5>
{% if result.highlights %}
<div class="card-text mt-2">
{% for highlight in result.highlights %}
<p class="mb-1">...{{ highlight|safe }}...</p>
{% endfor %}
</div>
{% endif %}
<div class="mt-3 text-muted small">
<span class="me-3">
<i class="bi bi-star-fill text-warning"></i> Relevância: {{ result.score|floatformat:2 }}
</span>
{% if result.link %}
<a href="{{ result.link }}" target="_blank" class="text-decoration-none">
<i class="bi bi-box-arrow-up-right"></i> Ver original
</a>
{% endif %}
</div>
</div>
</div>
{% endfor %}
{% if pages > 1 %}
<nav aria-label="Page navigation">
<ul class="pagination justify-content-center">
{% if page > 1 %}
<li class="page-item">
<a class="page-link"
href="?q={{ query }}&page={{ page|add:'-1' }}&highlight={{ highlight|lower }}&fuzziness={{ fuzziness }}{% for tipo in tipos_selecionados %}&tipos={{ tipo }}{% endfor %}{% if data_inicio %}&data_inicio={{ data_inicio }}{% endif %}{% if data_fim %}&data_fim={{ data_fim }}{% endif %}">
Anterior
</a>
</li>
{% endif %}
{% for i in page_range %}
<li class="page-item {% if i == page %}active{% endif %}">
<a class="page-link"
href="?q={{ query }}&page={{ i }}&highlight={{ highlight|lower }}&fuzziness={{ fuzziness }}{% for tipo in tipos_selecionados %}&tipos={{ tipo }}{% endfor %}{% if data_inicio %}&data_inicio={{ data_inicio }}{% endif %}{% if data_fim %}&data_fim={{ data_fim }}{% endif %}">
{{ i }}
</a>
</li>
{% endfor %}
{% if page < pages %}
<li class="page-item">
<a class="page-link"
href="?q={{ query }}&page={{ page|add:'1' }}&highlight={{ highlight|lower }}&fuzziness={{ fuzziness }}{% for tipo in tipos_selecionados %}&tipos={{ tipo }}{% endfor %}{% if data_inicio %}&data_inicio={{ data_inicio }}{% endif %}{% if data_fim %}&data_fim={{ data_fim }}{% endif %}">
Próxima
</a>
</li>
{% endif %}
</ul>
</nav>
{% endif %}
{% else %}
<div class="alert alert-warning text-center">
<h4 class="alert-heading">Nenhum resultado encontrado</h4>
<p>Não encontramos resultados para "{{ query }}". Tente ajustar seus termos de busca.</p>
</div>
{% endif %}
</div>
{% else %}
<div class="text-center py-5 bg-light rounded">
<p class="lead text-muted">Digite um termo de busca para encontrar diários oficiais</p>
</div>
{% endif %}
</div>
{% endblock %}

View File

@ -1,330 +0,0 @@
<!DOCTYPE html>
<html lang="pt-BR">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{% if query %}{{ query }} - {% endif %}Pesquisa de Documentos</title>
<!-- Bootstrap 5 CSS -->
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
<!-- Fonte personalizada -->
<link href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" rel="stylesheet">
<!-- Ícones do Bootstrap -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.8.1/font/bootstrap-icons.css">
<style>
body {
font-family: 'Roboto', Arial, sans-serif;
background-color: #f8f9fa;
color: #202124;
}
.search-container {
max-width: 650px;
margin: 0 auto;
}
.main-container {
max-width: 650px;
margin: 0 auto;
}
.search-box {
border-radius: 24px;
border: 1px solid #dfe1e5;
box-shadow: none;
height: 44px;
padding-left: 20px;
}
.search-box:focus {
box-shadow: 0 1px 6px rgba(32,33,36,.28);
border-color: rgba(223,225,229,0);
}
.search-button {
border-radius: 24px;
margin-left: 10px;
}
.result-item {
padding: 20px 0;
border-bottom: 1px solid #e0e0e0;
}
.result-item:last-child {
border-bottom: none;
}
.pdf-link {
color: #e74c3c;
margin-left: 10px;
font-size: 0.8em;
text-decoration: none;
}
.result-title {
display: flex;
align-items: center;
justify-content: space-between;
color: #1a0dab;
font-weight: 500;
margin-bottom: 5px;
font-size: 18px;
}
.result-title a {
text-decoration: none;
}
.result-title a:hover {
text-decoration: underline;
}
.result-content {
color: #4d5156;
font-size: 14px;
margin-bottom: 5px;
line-height: 1.58;
}
.result-meta {
color: #70757a;
font-size: 12px;
}
.suggestion {
color: #1a0dab;
text-decoration: none;
}
.suggestion:hover {
text-decoration: underline;
}
mark {
background-color: #ffffc2;
padding: 0;
font-weight: bold;
}
.related-term {
color: #70757a;
font-size: 14px;
margin-right: 10px;
}
.pagination-link {
color: #1a0dab;
padding: 0 10px;
text-decoration: none;
}
.pagination-link.active {
color: #202124;
font-weight: bold;
}
.pagination-link:hover {
text-decoration: underline;
}
.search-stats {
color: #70757a;
font-size: 14px;
margin-bottom: 15px;
}
.header {
padding: 20px 0;
background-color: white;
border-bottom: 1px solid #dfe1e5;
}
.badge-exact-match {
background-color: #e8f0fe;
color: #1a73e8;
border: 1px solid #d2e3fc;
font-weight: normal;
}
.search-tip {
font-size: 13px;
color: #70757a;
margin-top: 5px;
}
</style>
<div class="position-relative">
<input type="text" name="q" class="form-control search-box"
id="searchInput" autocomplete="off">
<div id="suggestionsBox" class="position-absolute w-100 bg-white shadow"></div>
</div>
<script>
// JavaScript para sugestões em tempo real
document.getElementById('searchInput').addEventListener('input', function(e) {
const query = e.target.value;
if(query.length > 2) {
fetch(`/diarios/spellcheck/?q=${encodeURIComponent(query)}`)
.then(response => response.json())
.then(data => {
const suggestionsBox = document.getElementById('suggestionsBox');
suggestionsBox.innerHTML = data.suggestions.map(sug =>
`<div class="suggestion-item p-2 border-bottom cursor-pointer">
${sug}
</div>`
).join('');
});
}
});
// Clique na sugestão
document.getElementById('suggestionsBox').addEventListener('click', function(e) {
if(e.target.classList.contains('suggestion-item')) {
document.getElementById('searchInput').value = e.target.textContent;
this.innerHTML = '';
}
});
</script>
</head>
<body>
<!-- Cabeçalho com barra de pesquisa -->
<header class="header">
<div class="container">
<div class="row align-items-center">
<div class="col-auto">
<a href="/" class="text-decoration-none">
<h3 class="mb-0 text-primary"><i class="bi bi-search"></i> BuscaDocs</h3>
</a>
</div>
<div class="col">
<form action="{% url 'search_view' %}" method="get" class="d-flex">
<input type="text" name="q" class="form-control search-box" value="{{ query }}" placeholder="Pesquisar documentos..." aria-label="Pesquisar">
<button class="btn btn-primary search-button" type="submit"><i class="bi bi-search"></i></button>
</form>
<div class="search-tip">
Use aspas duplas para buscar frases exatas, ex: "documento oficial"
</div>
</div>
</div>
</div>
</header>
<div class="container py-4 main-container">
{% if query %}
<!-- Estatísticas da busca -->
<div class="search-stats">
{% if total_hits > 0 %}
<p>Cerca de {{ total_hits }} resultados encontrados para "{{ query }}"</p>
{% else %}
<p>Nenhum resultado encontrado para "{{ query }}"</p>
{% endif %}
</div>
<!-- Correção ortográfica -->
{% if spelling_correction %}
<div class="mb-4">
<p>Você quis dizer: <a href="?q={{ spelling_correction|urlencode }}" class="suggestion">{{ spelling_correction }}</a>?</p>
</div>
{% endif %}
<!-- Sugestões de termos -->
{% if suggestions %}
<div class="mb-4">
<p>Talvez você esteja procurando por:
{% for suggestion in suggestions %}
<a href="?q={{ suggestion|urlencode }}" class="suggestion me-2">{{ suggestion }}</a>{% if not forloop.last %}, {% endif %}
{% endfor %}
</p>
</div>
{% endif %}
<!-- Resultados da busca -->
{% if results %}
<div class="results-container">
{% for result in results %}
<div class="result-item">
<div class="d-flex gap-2 mb-2">
{% if result.is_exact_match %}
<span class="badge badge-exact-match">Correspondência exata</span>
{% endif %}
{% if result.is_related %}
<span class="badge bg-secondary">Termo relacionado</span>
{% endif %}
</div>
<h5 class="result-title">
<a href="{{ result.pdf_url }}" target="_blank">{{ result.highlighted_title|safe }}</a>
<a href="{{ result.pdf_url }}" target="_blank" class="pdf-link" title="Abrir PDF completo">
<i class="bi bi-file-pdf"></i>
</a>
</h5>
<div class="result-content">{{ result.highlighted_content|safe }}</div>
<div class="result-meta">
<i class="bi bi-calendar-date"></i> {{ result.uploaded_at|date:"d/m/Y" }}
{% if result.matching_pages %}
<span class="ms-3">
<i class="bi bi-file-earmark-text"></i> Páginas encontradas:
{% for page in result.matching_pages %}
<a href="{{ result.pdf_url }}#page={{ page }}" target="_blank" class="badge bg-light text-dark">{{ page }}</a>
{% endfor %}
</span>
{% endif %}
</div>
</div>
{% endfor %}
</div>
<!-- Paginação -->
{% if total_pages > 1 %}
<nav aria-label="Paginação de resultados" class="my-4">
<div class="d-flex justify-content-center">
{% if page > 1 %}
<a href="?q={{ query|urlencode }}&page={{ page|add:'-1' }}" class="pagination-link">
<i class="bi bi-chevron-left"></i> Anterior
</a>
{% endif %}
{% for p in page_range %}
<a href="?q={{ query|urlencode }}&page={{ p }}" class="pagination-link {% if p == page %}active{% endif %}">
{{ p }}
</a>
{% endfor %}
{% if page < total_pages %}
<a href="?q={{ query|urlencode }}&page={{ page|add:'1' }}" class="pagination-link">
Próxima <i class="bi bi-chevron-right"></i>
</a>
{% endif %}
</div>
</nav>
{% endif %}
{% else %}
<div class="alert alert-info" role="alert">
<i class="bi bi-info-circle-fill me-2"></i>
Nenhum documento corresponde aos termos de pesquisa. Tente usar palavras-chave diferentes ou mais gerais.
{% if has_exact_phrases %}
<p class="mt-2 mb-0">Você pesquisou por frases exatas. Tente remover as aspas para uma busca mais ampla.</p>
{% endif %}
</div>
{% endif %}
{% else %}
<!-- Página inicial de pesquisa -->
<div class="text-center py-5">
<h1 class="display-4 mb-4 text-primary"><i class="bi bi-search"></i> BuscaDocs</h1>
<div class="search-container mb-4">
<form action="{% url 'search_view' %}" method="get">
<div class="input-group mb-3">
<input type="text" name="q" class="form-control search-box py-3" placeholder="Pesquisar documentos..." aria-label="Pesquisar">
<button class="btn btn-primary search-button px-4" type="submit">
<i class="bi bi-search"></i> Pesquisar
</button>
</div>
</form>
<p class="text-muted">Pesquise em nossa biblioteca de documentos digitalizados</p>
<div class="mt-3 text-start p-3 border rounded bg-light">
<h5>Dicas de pesquisa:</h5>
<ul class="mb-0">
<li>Use <strong>aspas duplas</strong> para buscar frases exatas: <code>"documento oficial"</code></li>
<li>Tente usar sinônimos se não encontrar resultados</li>
<li>Seja específico para encontrar documentos relevantes</li>
</ul>
</div>
</div>
</div>
{% endif %}
</div>
<!-- Footer -->
<footer class="bg-light py-3 mt-5">
<div class="container text-center">
<p class="text-muted mb-0">© 2025 BuscaDocs - Sistema de Pesquisa de Documentos</p>
</div>
</footer>
<!-- Bootstrap JS Bundle with Popper -->
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"></script>
<!-- Script para sugestões em tempo real (opcional) -->
<script>
document.addEventListener('DOMContentLoaded', function() {
const searchInput = document.querySelector('input[name="q"]');
searchInput.focus();
});
</script>
</body>
</html>

View File

@ -1,7 +1,8 @@
from django.urls import path
from .views import search_view, spellcheck_view
from . import views
urlpatterns = [
path("pesquisa/", search_view, name="search_view"),
path("spellcheck/", spellcheck_view, name="spellcheck_view"),
path('diario/<int:pk>/', views.diario_detail, name='diario_detail'),
path('diarios/search/', views.search_diarios, name='search_diarios'),
]

View File

@ -1,159 +1,83 @@
import json
import debugpy
from django.shortcuts import render
from elasticsearch_dsl import Search, Q
from elasticsearch_dsl.connections import connections
from django.conf import settings
import re
from .documents import PDFDocument
from django.http import JsonResponse
from elasticsearch_dsl import Q
from .documents import DiarioOficialDocument
# Configuração da conexão com o Elasticsearch
connections.create_connection(hosts=[settings.ELASTICSEARCH_HOSTS])
def spellcheck_view(request):
query = request.GET.get("q", "")
suggestions = []
if query:
s = Search(index="pdf_documents")
s = s.suggest(
"auto_correct",
query,
phrase={
"field": "suggest",
"size": 3,
"gram_size": 3,
"confidence": 2.0,
"direct_generator": [{"field": "suggest", "suggest_mode": "popular"}],
},
)
response = s.execute()
if hasattr(response.suggest, "auto_correct"):
for option in response.suggest.auto_correct[0].options:
suggestions.append(option.text)
return JsonResponse({"suggestions": suggestions})
# NOTE(review): the lines from here to the final render() call appear to
# interleave two versions of the search view (search_view and search_diarios),
# as in a rendered diff without +/- markers: names such as `query`,
# `per_page`, `exact_phrases` and `total_pages` are used inside
# search_diarios without being assigned there. Confirm which lines belong to
# the current file before changing any code.
def search_view(request):
query = request.GET.get('q', '') # Read the search term from the URL
def search_diarios(request):
q = request.GET.get('q', '')
# NOTE(review): int() is applied directly to the raw query-string values, so
# a non-numeric `page` or `size` raises ValueError, and zero/negative values
# are not rejected — consider validating.
page = int(request.GET.get('page', 1))
size = int(request.GET.get('size', 10))
start = (page - 1) * size
end = start + size
results = []
suggestions = []
spelling_correction = None
total_hits = 0
per_page = 10
if query:
# Special handling for terms wrapped in double quotes
exact_phrases = re.findall(r'"([^"]*)"', query)
# Strip the quoted phrases out of the main query
cleaned_query = query
for phrase in exact_phrases:
cleaned_query = cleaned_query.replace(f'"{phrase}"', '')
# Collapse runs of whitespace and trim the ends
cleaned_query = re.sub(r'\s+', ' ', cleaned_query).strip()
# Build the Elasticsearch search
search = Search(index='diarios_oficiais')
# Collects every sub-query
queries = []
# General-terms query (fuzziness gives tolerance to typos)
if cleaned_query:
queries.append(
Q('multi_match',
query=cleaned_query,
fields=['title^3', 'pages.content^2'],
fuzziness='AUTO',
boost=2)
)
# Exact queries for the quoted phrases (no fuzziness)
for phrase in exact_phrases:
if phrase.strip():
queries.append(
Q('match_phrase',
pages__content={
'query': phrase,
'boost': 2,
'slop': 0 # No flexibility in word order
})
)
# Combine the queries with OR (if there are any)
if queries:
search = search.query(
Q('bool', should=queries, minimum_should_match=1)
)
# Highlight configuration to show more context
search = search.highlight(
'pages.content',
fragment_size=300,
number_of_fragments=2,
pre_tags=['<mark>'],
post_tags=['</mark>']
)
# Pagination
search = search[(page-1)*per_page:page*per_page]
# Run the query
response = search.execute()
total_hits = response.hits.total.value
# Process the results
for hit in response:
# Use the highlighted content when present, otherwise an empty string
if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'pages.content'):
highlighted_content = ' ... '.join(hit.meta.highlight['pages.content'])
else:
highlighted_content = ""
# Use the highlighted title when present, otherwise the original title
if hasattr(hit.meta, 'highlight') and hasattr(hit.meta.highlight, 'title'):
highlighted_title = hit.meta.highlight.title[0]
else:
highlighted_title = hit.title
# Check whether the result matches one of the exact phrases
is_exact_match = any(phrase.lower() in hit.pages.content.lower() or
phrase.lower() in hit.title.lower()
for phrase in exact_phrases)
results.append({
'id': hit.meta.id,
'title': hit.title,
'highlighted_title': highlighted_title,
'highlighted_content': highlighted_content,
'data': hit.data,
'numero': hit.numero,
'link': hit.link,
'finalizado': hit.finalizado,
'is_exact_match': is_exact_match
})
# Compute the number of pages
total_pages = (total_hits + per_page - 1) // per_page if total_hits > 0 else 0
# Render the template with the results
return render(request, 'diarios/search_results.html', {
'query': query,
total = 0
if q:
# Main search, with per-field boosts for relevance
query = Q(
'multi_match',
query=q,
fields=['content^3', 'tipo.nome^2', 'numero', 'pages.content'],
fuzziness='AUTO'
)
# Search with highlighting
search = DiarioOficialDocument.search()
search = search.query(query)
search = search.highlight('content', fragment_size=150, number_of_fragments=3)
search = search.highlight('pages.content', fragment_size=150, number_of_fragments=3)
# Pagination
search = search[start:end]
response = search.execute()
total = response.hits.total.value
for hit in response:
# Add the highlight
highlight = ""
if hasattr(hit.meta, 'highlight'):
if 'content' in hit.meta.highlight:
highlight = "...".join(hit.meta.highlight.content)
# Process the highlighted pages
highlighted_pages = []
if hasattr(hit.meta, 'highlight') and 'pages.content' in hit.meta.highlight:
for i, content in enumerate(hit.meta.highlight['pages.content']):
# Find the matching page
# NOTE(review): `i` is the position of the highlight fragment in the
# list, so `number` is a fragment index rather than a looked-up page.
page_number = i + 1 # Simplified logic, may need adjusting
highlighted_pages.append({
'number': page_number,
'content': content
})
# Combine the document data with the highlights
result = {
'id': hit.id,
'tipo': hit.tipo.nome if hasattr(hit, 'tipo') and hit.tipo else '',
'numero': hit.numero,
'data': hit.data,
'link': hit.link,
'highlight': highlight,
'highlighted_pages': highlighted_pages
}
results.append(result)
context = {
'query': q,
'results': results,
'suggestions': suggestions[:5], # Limit to 5 suggestions
'spelling_correction': spelling_correction,
'total_hits': total_hits,
'total': total,
'page': page,
'total_pages': total_pages,
'page_range': range(max(1, page-2), min(total_pages+1, page+3)),
'has_exact_phrases': bool(exact_phrases)
})
'size': size,
'total_pages': (total + size - 1) // size if total > 0 else 0,
}
return render(request, 'diarios/diarios_search.html', context)
def diario_detail(request, pk):
    """Render the detail page for a single diário.

    Args:
        request: The incoming HTTP request.
        pk: Primary key of the record, captured from the URL.

    Returns:
        The rendered ``diarios/diario_detail.html`` response, with the record
        available to the template as ``diario``. ``get_object_or_404`` raises
        ``Http404`` when no record has the given primary key.
    """
    # Imported here because the module-level imports only bring in `render`
    # from django.shortcuts and no model class, so the original body would
    # fail with NameError on its first call.
    from django.shortcuts import get_object_or_404

    # NOTE(review): the original referenced `Diario`; the model is assumed to
    # be `DiarioOficial`, the name the admin module imports from `.models`
    # — confirm.
    from .models import DiarioOficial

    diario = get_object_or_404(DiarioOficial, pk=pk)
    return render(request, 'diarios/diario_detail.html', {'diario': diario})

View File

@ -16,7 +16,9 @@ django-crispy-forms==2.3 # https://github.com/django-crispy-forms/django-crispy
crispy-bootstrap5==2024.10 # https://github.com/django-crispy-forms/crispy-bootstrap5
django-compressor==4.5.1 # https://github.com/django-compressor/django-compressor
django-redis==5.4.0 # https://github.com/jazzband/django-redis
djangorestframework
elasticsearch
django-elasticsearch-dsl
PyPDF2
babel

4
sinonimos.txt Normal file
View File

@ -0,0 +1,4 @@
lei, legislação, norma
processo, procedimento, autos
contrato, acordo, convênio