arruma o processo de busca textual nos diarios

This commit is contained in:
root
2025-03-14 17:36:14 +01:00
parent 8d1f6feeaf
commit f2e5cd73b7
15 changed files with 650 additions and 645 deletions

View File

@ -1,157 +1,91 @@
from django_elasticsearch_dsl import Document, fields
from django_elasticsearch_dsl.registries import registry
from .models import PDFDocument
@registry.register_document
class PDFDocumentDocument(Document):
title = fields.TextField()
content = fields.TextField(analyzer="portuguese")
pages = fields.NestedField(
properties={
"number": fields.IntegerField(),
"content": fields.TextField(analyzer="portuguese"),
}
)
class Index:
name = "pdf_documents"
settings = {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"analyzer": {
"portuguese": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"ascii_folding",
"portuguese_stemmer",
"stop",
"portuguese_synonyms",
],
},
"portuguese_search": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"ascii_folding",
"portuguese_stemmer",
"stop",
"suggest_shingle",
],
},
},
"filter": {
"suggest_shingle": {
"type": "shingle",
"min_shingle_size": 2,
"max_shingle_size": 3,
},
"stop": {"type": "stop", "stopwords": "_portuguese_"},
"ascii_folding": {"type": "asciifolding"},
"portuguese_stemmer": {"type": "stemmer", "language": "portuguese"},
"portuguese_synonyms": {
"type": "synonym",
"synonyms_path": "synonyms.txt",
"expand": True,
},
},
},
}
class Django:
model = PDFDocument
fields = ["uploaded_at", "file"]
from django_elasticsearch_dsl import Document, fields
from django_elasticsearch_dsl.registries import registry
from .models import DiarioOficial
@registry.register_document
class DiarioOficialDocument(Document):
# Campos principais
title = fields.TextField()
tipo = fields.KeywordField()
# Campo para arquivo PDF (se aplicável)
arquivo = fields.TextField(attr="arquivo.url")
# Nested field para páginas (usando o page_content)
pages = fields.NestedField(
properties={
"number": fields.IntegerField(),
"content": fields.TextField(analyzer="portuguese")
}
tipo = fields.ObjectField(properties={
'nome': fields.TextField()
})
numero = fields.TextField()
data = fields.DateField()
link = fields.TextField()
# Campo para armazenar todas as páginas para busca
content = fields.TextField(
analyzer='pt_analyzer',
)
# Campo para armazenar páginas individualmente
pages = fields.NestedField(properties={
'number': fields.IntegerField(),
'content': fields.TextField(
analyzer='pt_analyzer',
)
})
class Index:
name = "diarios_oficiais"
name = 'diarios_oficiais'
settings = {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"analyzer": {
"portuguese": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"ascii_folding",
"portuguese_stemmer",
"stop",
"portuguese_synonyms",
]
'number_of_shards': 1,
'number_of_replicas': 0,
'analysis': {
'filter': {
'portuguese_stop': {
'type': 'stop',
'stopwords': '_portuguese_'
},
"portuguese_search": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"ascii_folding",
"portuguese_stemmer",
"stop",
"suggest_shingle",
'portuguese_stemmer': {
'type': 'stemmer',
'language': 'portuguese'
},
'synonym_filter': {
'type': 'synonym',
'synonyms': [
'lei, legislação, norma',
'processo, procedimento, autos',
'contrato, acordo, convênio',
# Adicione mais sinônimos relevantes para o contexto legal
]
}
},
"filter": {
"suggest_shingle": {
"type": "shingle",
"min_shingle_size": 2,
"max_shingle_size": 3
},
"stop": {"type": "stop", "stopwords": "_portuguese_"},
"ascii_folding": {"type": "asciifolding"},
"portuguese_stemmer": {"type": "stemmer", "language": "portuguese"},
"portuguese_synonyms": {
"type": "synonym",
"synonyms_path": "synonyms.txt",
"expand": True
'analyzer': {
'pt_analyzer': {
'tokenizer': 'standard',
'filter': [
'lowercase',
'portuguese_stop',
'portuguese_stemmer',
'synonym_filter'
]
}
}
}
}
class Django:
model = DiarioOficial
fields = [
"data",
"numero",
"link",
'id'
]
def prepare_tipo(self, instance):
return instance.tipo.nome if instance.tipo else None
def prepare_title(self, instance):
return f"{instance.tipo.nome if instance.tipo else 'Diário'} {instance.numero}"
def prepare_pages(self, instance):
# Prepara o campo pages usando o page_content
if instance.tipo:
return {
'nome': instance.tipo.nome
}
return {}
def prepare_content(self, instance):
"""Concatena todo o conteúdo de todas as páginas em um único campo para busca"""
if instance.page_content:
return instance.page_content # page_content já é uma lista de dicionários
return " ".join([page.get('content', '') for page in instance.page_content])
return ""
def prepare_pages(self, instance):
"""Prepara o campo de páginas individuais para exibição e destaque"""
if instance.page_content:
return instance.page_content
return []