⚗️(backend) function to extract text from base64 yjs document

Function to extract text from base64 yjs document.
Can be useful if we need to index the content
of the documents.
This commit is contained in:
Anthony LC 2024-09-20 10:43:24 +02:00
parent ac86a4e7f7
commit 1ee8e5fdba
4 changed files with 51 additions and 1 deletions

View file

@ -9,6 +9,10 @@ and this project adheres to
## [Unreleased]
## Added
- ⚗️(backend) Extract text from base64 yjs document #270
## [1.4.0] - 2024-09-17

View file

@ -10,7 +10,7 @@ from django.core import mail
import pytest
from core.utils import email_invitation
from core.utils import email_invitation, yjs_base64_to_text
pytestmark = pytest.mark.django_db
@ -85,3 +85,29 @@ def test_utils__email_invitation_failed(mock_logger, _mock_send_mail):
assert email == "guest@example.com"
assert isinstance(exception, smtplib.SMTPException)
def test_yjs_base64_to_text():
    """
    Check that yjs_base64_to_text extracts the plain text of a yjs document.

    The fixture is an example of what is saved in the database: a base64
    string generated from the blocknote editor. Its content is the
    markdown-like text "# *Hello*" followed by "- w**or**ld".
    """
    # The fixture is split into chunks only to keep the lines short; the
    # chunks are joined back into a single base64 string before decoding.
    encoded_chunks = (
        "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh",
        "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI",
        "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y",
        "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm",
        "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y",
        "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt",
        "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE",
        "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck",
        "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH",
        "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv",
        "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA",
        "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J",
        "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA",
    )
    encoded_document = "".join(encoded_chunks)
    expected_text = "Hello world"

    assert yjs_base64_to_text(encoded_document) == expected_text

View file

@ -2,6 +2,7 @@
Utilities for the core app.
"""
import base64
import smtplib
from logging import getLogger
@ -12,6 +13,9 @@ from django.template.loader import render_to_string
from django.utils.translation import gettext_lazy as _
from django.utils.translation import override
import y_py as Y
from bs4 import BeautifulSoup
logger = getLogger(__name__)
@ -38,3 +42,17 @@ def email_invitation(language, email, document_id):
except smtplib.SMTPException as exception:
logger.error("invitation to %s was not sent: %s", email, exception)
def yjs_base64_to_text(base64_string):
    """
    Return the plain text held in a base64-encoded yjs document.

    The input is base64-decoded and applied as an update to a fresh YDoc.
    The "document-store" XML element of that doc is rendered to a string
    and parsed with BeautifulSoup; its text nodes are joined with single
    spaces and surrounding whitespace is stripped from the result.
    """
    update = bytearray(base64.b64decode(base64_string))

    ydoc = Y.YDoc()  # pylint: disable=E1101
    Y.apply_update(ydoc, update)  # pylint: disable=E1101

    # The XML element is stringified so it can be parsed as markup.
    markup = str(ydoc.get_xml_element("document-store"))
    return BeautifulSoup(markup, "html.parser").get_text(separator=" ").strip()

View file

@ -25,6 +25,7 @@ license = { file = "LICENSE" }
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"beautifulsoup4==4.12.3",
"boto3==1.35.10",
"Brotli==1.1.0",
"celery[redis]==5.4.0",
@ -57,6 +58,7 @@ dependencies = [
"WeasyPrint>=60.2",
"whitenoise==6.7.0",
"mozilla-django-oidc==4.0.1",
"y-py==0.5.5",
]
[project.urls]