mirror of
https://github.com/suitenumerique/docs
synced 2026-04-21 13:37:20 +00:00
⚗️(backend) function to extract text from base64 yjs document
Add a function to extract text from a base64-encoded yjs document. This can be useful if we need to index the content of the documents.
This commit is contained in:
parent
ac86a4e7f7
commit
1ee8e5fdba
4 changed files with 51 additions and 1 deletions
|
|
@ -9,6 +9,10 @@ and this project adheres to
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
## Added
|
||||
|
||||
- ⚗️(backend) Extract text from base64 yjs document #270
|
||||
|
||||
|
||||
## [1.4.0] - 2024-09-17
|
||||
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ from django.core import mail
|
|||
|
||||
import pytest
|
||||
|
||||
from core.utils import email_invitation
|
||||
from core.utils import email_invitation, yjs_base64_to_text
|
||||
|
||||
pytestmark = pytest.mark.django_db
|
||||
|
||||
|
|
@ -85,3 +85,29 @@ def test_utils__email_invitation_failed(mock_logger, _mock_send_mail):
|
|||
|
||||
assert email == "guest@example.com"
|
||||
assert isinstance(exception, smtplib.SMTPException)
|
||||
|
||||
|
||||
def test_yjs_base64_to_text():
    """
    yjs_base64_to_text should return the plain text of a stored document.

    The fixture below is an example of the base64 payload saved in the
    database. It was generated from the blocknote editor and holds a heading
    "Hello" written in italic, followed by a bullet list item "world" whose
    letters "or" are in bold.
    """
    # Fixture split in chunks only to keep the line length reasonable
    encoded_chunks = (
        "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh",
        "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI",
        "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y",
        "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm",
        "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y",
        "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt",
        "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE",
        "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck",
        "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH",
        "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv",
        "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA",
        "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J",
        "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA",
    )
    encoded_document = "".join(encoded_chunks)

    extracted_text = yjs_base64_to_text(encoded_document)

    assert extracted_text == "Hello world"
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
Utilities for the core app.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import smtplib
|
||||
from logging import getLogger
|
||||
|
||||
|
|
@ -12,6 +13,9 @@ from django.template.loader import render_to_string
|
|||
from django.utils.translation import gettext_lazy as _
|
||||
from django.utils.translation import override
|
||||
|
||||
import y_py as Y
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
||||
|
|
@ -38,3 +42,17 @@ def email_invitation(language, email, document_id):
|
|||
|
||||
except smtplib.SMTPException as exception:
|
||||
logger.error("invitation to %s was not sent: %s", email, exception)
|
||||
|
||||
|
||||
def yjs_base64_to_text(base64_string):
    """
    Extract the plain text from a base64 yjs document.

    The base64 payload is decoded and applied as an update to a fresh YDoc.
    The "document-store" xml element of that doc is then rendered as a string
    and its markup is removed with BeautifulSoup.

    Args:
        base64_string: base64 representation of the yjs document. An empty
            value or None is treated as a document without content.

    Returns:
        The text nodes of the document joined with a single space, without
        leading or trailing whitespace. An empty string when there is no
        content to decode.
    """
    # An empty value carries no document: return early instead of decoding it
    if not base64_string:
        return ""

    decoded_bytes = base64.b64decode(base64_string)
    uint8_array = bytearray(decoded_bytes)

    doc = Y.YDoc()  # pylint: disable=E1101
    Y.apply_update(doc, uint8_array)  # pylint: disable=E1101
    blocknote_structure = str(doc.get_xml_element("document-store"))

    # The serialized element is markup: keep only its text nodes
    soup = BeautifulSoup(blocknote_structure, "html.parser")
    return soup.get_text(separator=" ").strip()
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ license = { file = "LICENSE" }
|
|||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"beautifulsoup4==4.12.3",
|
||||
"boto3==1.35.10",
|
||||
"Brotli==1.1.0",
|
||||
"celery[redis]==5.4.0",
|
||||
|
|
@ -57,6 +58,7 @@ dependencies = [
|
|||
"WeasyPrint>=60.2",
|
||||
"whitenoise==6.7.0",
|
||||
"mozilla-django-oidc==4.0.1",
|
||||
"y-py==0.5.5",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
|
|
|
|||
Loading…
Reference in a new issue