Files
Diarios-Oficiais-ALEMS/diarios/documents.py

68 lines
2.3 KiB
Python
Raw Normal View History

from django_elasticsearch_dsl import Document, fields
from django_elasticsearch_dsl.registries import registry
from .models import PDFDocument
@registry.register_document
class PDFDocumentDocument(Document):
    """Elasticsearch document definition for the ``PDFDocument`` model.

    Declares three explicit fields (``title``, ``content`` and the nested
    ``pages``) plus the model fields listed in ``Django.fields``, and
    configures the ``pdf_documents`` index with two custom Portuguese
    analyzers:

    * ``portuguese`` -- used by ``content`` and ``pages.content``; ends with
      the ``portuguese_synonyms`` filter.
    * ``portuguese_search`` -- same chain, but ends with the
      ``suggest_shingle`` filter instead of synonyms. It is not referenced
      by any field in this class (presumably selected at query time
      elsewhere -- confirm against the callers).

    NOTE: analysis settings are applied when the index is created, so an
    existing index has to be rebuilt for a change in filter order to take
    effect.
    """

    title = fields.TextField()
    content = fields.TextField(analyzer="portuguese")
    # One entry per page: its number and its text.
    # NOTE(review): presumably filled from a ``pages`` attribute/relation on
    # the model -- confirm against PDFDocument.
    pages = fields.NestedField(
        properties={
            "number": fields.IntegerField(),
            "content": fields.TextField(analyzer="portuguese"),
        }
    )

    class Index:
        name = "pdf_documents"
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "analysis": {
                "analyzer": {
                    "portuguese": {
                        "type": "custom",
                        "tokenizer": "standard",
                        # "stop" must run right after "lowercase": the
                        # _portuguese_ stop list holds the original
                        # (accented, unstemmed) word forms, so running it
                        # after folding/stemming lets words such as "não"
                        # or "muito" slip through as "nao" / "muit".
                        "filter": [
                            "lowercase",
                            "stop",
                            "ascii_folding",
                            "portuguese_stemmer",
                            "portuguese_synonyms",
                        ],
                    },
                    "portuguese_search": {
                        "type": "custom",
                        "tokenizer": "standard",
                        # Same ordering as the "portuguese" analyzer so both
                        # chains produce the same token forms before their
                        # final, analyzer-specific filter.
                        "filter": [
                            "lowercase",
                            "stop",
                            "ascii_folding",
                            "portuguese_stemmer",
                            "suggest_shingle",
                        ],
                    },
                },
                "filter": {
                    # Emits 2- and 3-token shingles.
                    "suggest_shingle": {
                        "type": "shingle",
                        "min_shingle_size": 2,
                        "max_shingle_size": 3,
                    },
                    # Custom filter deliberately named "stop"; it uses the
                    # predefined Portuguese stop-word list.
                    "stop": {"type": "stop", "stopwords": "_portuguese_"},
                    "ascii_folding": {"type": "asciifolding"},
                    "portuguese_stemmer": {
                        "type": "stemmer",
                        "language": "portuguese",
                    },
                    # The synonyms file is loaded by Elasticsearch itself,
                    # not by Django; the path is resolved on the ES node.
                    "portuguese_synonyms": {
                        "type": "synonym",
                        "synonyms_path": "synonyms.txt",
                        "expand": True,
                    },
                },
            },
        }

    class Django:
        model = PDFDocument
        # Model fields indexed in addition to the explicitly declared ones.
        fields = ["uploaded_at", "file"]