from django_elasticsearch_dsl import Document, fields
from django_elasticsearch_dsl.registries import registry

from .models import PDFDocument


@registry.register_document
class PDFDocumentDocument(Document):
    """Elasticsearch document definition for ``PDFDocument`` model instances.

    Registered with the django_elasticsearch_dsl registry via
    ``@registry.register_document``. The document is stored in the
    ``pdf_documents`` index and exposes:

    * ``title`` -- text field with no analyzer specified.
    * ``content`` -- text field analyzed with the custom ``portuguese``
      analyzer declared in ``Index.settings``.
    * ``pages`` -- nested objects, each with an integer ``number`` and a
      ``content`` text field analyzed with the same ``portuguese`` analyzer.
    * ``uploaded_at`` and ``file`` -- mapped automatically from the model
      through ``Django.fields``.
    """

    title = fields.TextField()
    content = fields.TextField(analyzer="portuguese")
    pages = fields.NestedField(
        properties={
            "number": fields.IntegerField(),
            "content": fields.TextField(analyzer="portuguese"),
        }
    )

    class Index:
        # Index name and creation-time settings for this document.
        name = "pdf_documents"
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "analysis": {
                "analyzer": {
                    # NOTE(review): this custom analyzer shares its name with
                    # Elasticsearch's built-in "portuguese" language analyzer;
                    # confirm the definition below is the one that should win.
                    "portuguese": {
                        "type": "custom",
                        "tokenizer": "standard",
                        # "stop" must run right after "lowercase": the
                        # _portuguese_ stop list holds accented, unstemmed
                        # forms (e.g. "não", "é", "já"), so removing stop
                        # words after ASCII folding or stemming would miss
                        # them.
                        "filter": [
                            "lowercase",
                            "stop",
                            "ascii_folding",
                            "portuguese_stemmer",
                            "portuguese_synonyms",
                        ],
                    },
                    # Same chain as "portuguese" but ending in 2-3 word
                    # shingles instead of synonyms. Not referenced by any
                    # field declared in this document.
                    "portuguese_search": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": [
                            "lowercase",
                            "stop",
                            "ascii_folding",
                            "portuguese_stemmer",
                            "suggest_shingle",
                        ],
                    },
                },
                "filter": {
                    "suggest_shingle": {
                        "type": "shingle",
                        "min_shingle_size": 2,
                        "max_shingle_size": 3,
                    },
                    "stop": {"type": "stop", "stopwords": "_portuguese_"},
                    "ascii_folding": {"type": "asciifolding"},
                    "portuguese_stemmer": {
                        "type": "stemmer",
                        "language": "portuguese",
                    },
                    # The synonyms file is loaded by Elasticsearch itself, so
                    # "synonyms.txt" has to exist on the Elasticsearch nodes
                    # (presumably relative to the ES config directory --
                    # verify against the deployment).
                    "portuguese_synonyms": {
                        "type": "synonym",
                        "synonyms_path": "synonyms.txt",
                        "expand": True,
                    },
                },
            },
        }

    class Django:
        # Model backing this document, and the model fields that are mapped
        # automatically in addition to the fields declared explicitly above.
        model = PDFDocument
        fields = ["uploaded_at", "file"]