2025-03-07 07:10:13 +01:00
|
|
|
from django_elasticsearch_dsl import Document, fields
|
|
|
|
|
from django_elasticsearch_dsl.registries import registry
|
|
|
|
|
from .models import PDFDocument
|
|
|
|
|
|
2025-03-07 15:31:53 +01:00
|
|
|
|
2025-03-07 07:10:13 +01:00
|
|
|
@registry.register_document
class PDFDocumentDocument(Document):
    """Elasticsearch document definition for the ``PDFDocument`` model.

    Declares three explicit fields (``title``, ``content`` and the nested
    ``pages`` list) and, through the inner ``Django`` class, the model
    fields ``uploaded_at`` and ``file``.  The index named ``pdf_documents``
    carries two custom Portuguese analyzers defined in ``Index.settings``.
    """

    # Uses the index's default analyzer (no analyzer is specified here).
    title = fields.TextField()
    # Analyzed with the custom "portuguese" analyzer defined in Index.settings.
    content = fields.TextField(analyzer="portuguese")
    # One nested object per page: its number and its text.
    pages = fields.NestedField(
        properties={
            "number": fields.IntegerField(),
            "content": fields.TextField(analyzer="portuguese"),
        }
    )

    class Index:
        """Index name and index-level settings (shards, replicas, analysis)."""

        name = "pdf_documents"
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "analysis": {
                "analyzer": {
                    # Analyzer referenced by the ``content`` fields above.
                    "portuguese": {
                        "type": "custom",
                        "tokenizer": "standard",
                        # Order matters: "stop" must see lowercased but still
                        # accented, unstemmed tokens, otherwise entries of the
                        # built-in _portuguese_ list (e.g. accented words) no
                        # longer match once folded/stemmed and slip through.
                        "filter": [
                            "lowercase",
                            "stop",
                            "ascii_folding",
                            "portuguese_stemmer",
                            "portuguese_synonyms",
                        ],
                    },
                    # NOTE(review): not referenced by any field in this class;
                    # presumably used explicitly at query/suggest time.
                    "portuguese_search": {
                        "type": "custom",
                        "tokenizer": "standard",
                        # Same ordering rationale as "portuguese"; shingles
                        # are built last from the normalized tokens.
                        "filter": [
                            "lowercase",
                            "stop",
                            "ascii_folding",
                            "portuguese_stemmer",
                            "suggest_shingle",
                        ],
                    },
                },
                "filter": {
                    # Word n-grams of 2 to 3 tokens.
                    "suggest_shingle": {
                        "type": "shingle",
                        "min_shingle_size": 2,
                        "max_shingle_size": 3,
                    },
                    "stop": {"type": "stop", "stopwords": "_portuguese_"},
                    "ascii_folding": {"type": "asciifolding"},
                    "portuguese_stemmer": {"type": "stemmer", "language": "portuguese"},
                    # NOTE(review): "synonyms.txt" is resolved by the
                    # Elasticsearch node, not by Django - confirm the file is
                    # deployed in the node's config directory.
                    "portuguese_synonyms": {
                        "type": "synonym",
                        "synonyms_path": "synonyms.txt",
                        "expand": True,
                    },
                },
            },
        }

    class Django:
        """Binds the document to its Django model and lists mapped fields."""

        model = PDFDocument
        # Model fields indexed in addition to the explicit fields above.
        fields = ["uploaded_at", "file"]
|