2025-03-07 07:10:13 +01:00
|
|
|
from django.db import models
|
|
|
|
|
import PyPDF2
|
2025-03-07 15:31:53 +01:00
|
|
|
import json
|
2025-03-07 07:10:13 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class PDFDocument(models.Model):
    """An uploaded PDF file together with the text extracted from it.

    The extracted text is stored twice: once as a single string
    (``content``) and once per page as a JSON document (``page_content``).
    Both are recomputed from ``file`` on every call to :meth:`save`.
    """

    # Human-readable title; also used as the string representation.
    title = models.CharField(max_length=255)
    # The PDF itself, stored under the "pdfs/" upload directory.
    file = models.FileField(upload_to="pdfs/")
    # Text of all pages joined with newlines; filled in by save().
    content = models.TextField(blank=True)
    # Set once, when the row is first created.
    uploaded_at = models.DateTimeField(auto_now_add=True)
    # JSON-encoded list of {"number": <1-based page>, "content": <text>}
    # objects; filled in by save().
    page_content = models.TextField(blank=True)

    def __str__(self):
        """Return the document title."""
        return self.title

    def save(self, *args, **kwargs):
        """Extract the PDF text into the text fields, then save the model.

        When ``file`` is set, every page is read with PyPDF2 and the
        results are written to ``content`` and ``page_content`` before
        delegating to the default ``save``. When ``file`` is empty the
        text fields are left untouched.

        Args:
            *args: Positional arguments forwarded to ``models.Model.save``.
            **kwargs: Keyword arguments forwarded to ``models.Model.save``.
        """
        if self.file:
            reader = PyPDF2.PdfReader(self.file)
            page_texts = []
            pages_data = []

            for number, page in enumerate(reader.pages, start=1):
                # Extraction is the expensive step, so run it once per page
                # and reuse the result for both stored representations.
                # `or ""` keeps the join below safe should a page yield no
                # text object at all.
                page_text = page.extract_text() or ""
                page_texts.append(page_text)
                pages_data.append(
                    {
                        "number": number,
                        "content": page_text,
                    }
                )

            self.content = "\n".join(page_texts)
            self.page_content = json.dumps(pages_data)

        super().save(*args, **kwargs)
|