💩(backend) add document content endpoint

Get the content of a document in markdown format.
Ex: http://localhost:8071/api/v1.0/documents/<ID>/content/
This commit is contained in:
Anthony LC 2025-06-03 12:12:19 +02:00
parent 23860065e1
commit 3cb7aeb7ec
3 changed files with 131 additions and 1 deletions

View file

@ -33,7 +33,7 @@ from rest_framework import response as drf_response
from rest_framework.permissions import AllowAny
from rest_framework.throttling import UserRateThrottle
from core import authentication, enums, models
from core import authentication, enums, models, utils as core_utils
from core.services.ai_services import AIService
from core.services.collaboration_services import CollaborationService
from core.utils import extract_attachments, filter_descendants
@ -1353,6 +1353,25 @@ class DocumentViewSet(
}
return drf.response.Response(body, status=drf.status.HTTP_200_OK)
@drf.decorators.action(detail=True, methods=["get"], url_path="content")
def content(self, request, *args, **kwargs):
    """
    Return the content of the targeted document converted to markdown.

    The document is resolved with `self.get_object()`, its base64 Yjs
    content is passed to `core_utils.base64_yjs_to_markdown`, and the result
    is returned under the "content" key with a 200 status.
    """
    document = self.get_object()
    markdown = core_utils.base64_yjs_to_markdown(document.content)
    return drf.response.Response(
        {"content": markdown},
        status=drf.status.HTTP_200_OK,
    )
@drf.decorators.action(
detail=True,

View file

@ -839,6 +839,7 @@ class Document(MP_Node, BaseModel):
"children_list": can_get,
"children_create": can_update and user.is_authenticated,
"collaboration_auth": can_get,
"content": can_get,
"cors_proxy": can_get,
"descendants": can_get,
"destroy": is_owner,

View file

@ -66,6 +66,116 @@ def base64_yjs_to_text(base64_string):
soup = BeautifulSoup(blocknote_structure, "lxml-xml")
return soup.get_text(separator=" ", strip=True)
def _render_block_as_markdown(node) -> list[str]:
    """Return the markdown lines for one content node (empty list if unknown)."""
    name = node.name

    if name == "heading":
        # A malformed "level" attribute falls back to 1 instead of raising.
        try:
            level = int(node.get("level", 1))
        except (TypeError, ValueError):
            level = 1
        # ATX headings only exist for levels 1 to 6.
        level = min(max(level, 1), 6)
        return ["#" * level + " " + process_inline_formatting(node), ""]

    if name == "paragraph":
        return [process_inline_formatting(node), ""]

    if name == "bulletListItem":
        return ["- " + process_inline_formatting(node)]

    if name == "numberedListItem":
        idx = node.get("index", "1")
        return [f"{idx}. " + process_inline_formatting(node)]

    if name == "checkListItem":
        checked = "x" if node.get("checked") == "true" else " "
        return [f"- [{checked}] " + process_inline_formatting(node)]

    if name == "codeBlock":
        lang = node.get("language", "")
        code = node.get_text("", strip=False)
        # Lengthen the fence until it no longer appears in the code itself,
        # otherwise a "```" inside the code would close the block early.
        fence = "```"
        while fence in code:
            fence += "`"
        return [f"{fence}{lang}", code, fence, ""]

    if name in {"quote", "blockquote"}:
        quote = process_inline_formatting(node)
        # An empty quote still produces a single "> " line.
        quoted = ["> " + line for line in (quote.splitlines() or [""])]
        return [*quoted, ""]

    if name == "divider":
        return ["---", ""]

    if name == "callout":
        emoji = node.get("emoji", "💡")
        return [f"> {emoji} {process_inline_formatting(node)}", ""]

    if name == "img":
        src = node.get("src", "")
        alt = node.get("alt", "")
        return [f"![{alt}]({src})", ""]

    # Unknown tags are ignored.
    return []


def base64_yjs_to_markdown(base64_string: str) -> str:
    """
    Convert a base64 Yjs document into a markdown string.

    The input is converted to XML with `base64_yjs_to_xml`, parsed with
    BeautifulSoup and walked depth-first. Wrapper tags ("[document]",
    "blockGroup", "blockContainer") are only traversed, known content tags
    are rendered to markdown lines, and any other tag is ignored together
    with its children.

    Args:
        base64_string: The document content, passed as-is to
            `base64_yjs_to_xml`.

    Returns:
        The markdown text. Leading and consecutive blank lines are collapsed,
        trailing whitespace is stripped and the text ends with one newline.
    """
    xml_content = base64_yjs_to_xml(base64_string)
    soup = BeautifulSoup(xml_content, "lxml-xml")

    md_lines: list[str] = []

    def walk(node) -> None:
        # Text nodes and anything without a tag name produce no output here.
        if not getattr(node, "name", None):
            return

        # Treat the synthetic "[document]" tag exactly like a wrapper.
        if node.name in {"[document]", "blockGroup", "blockContainer"}:
            for child in node.find_all(recursive=False):
                walk(child)
            if node.name == "blockContainer":
                md_lines.append("")  # paragraph break
            return

        md_lines.extend(_render_block_as_markdown(node))

    # Kick-off: start at the synthetic root.
    walk(soup)

    # Collapse accidental multiple blank lines (and drop leading ones).
    cleaned: list[str] = []
    for line in md_lines:
        if line == "" and (not cleaned or cleaned[-1] == ""):
            continue
        cleaned.append(line)

    return "\n".join(cleaned).rstrip() + "\n"
# Inline tags whose content is wrapped with the same marker on both sides.
_INLINE_MARKERS = {
    "bold": "**",
    "italic": "*",
    # NOTE(review): CommonMark renders "__text__" as strong emphasis, not as
    # underline; the marker is kept unchanged to preserve the current output.
    "underline": "__",
    "strike": "~~",
    "code": "`",
}


def process_inline_formatting(element):
    """
    Convert the inline content of an element to markdown text.

    Children are processed recursively: "bold", "italic", "underline",
    "strike" and "code" tags are wrapped with their markdown marker, "link"
    tags become "[text](href)", and any other tag contributes only its
    processed content.

    Args:
        element: Either a string (returned unchanged) or a node exposing
            `contents`, whose non-string children expose `name` and `get`.

    Returns:
        The markdown text for the element.
    """
    # If it's just a text node, return the text.
    if isinstance(element, str):
        return element

    # Collect the pieces and join once instead of growing a string with "+=".
    parts = []
    for child in element.contents:
        if isinstance(child, str):
            parts.append(child)
            continue
        if not hasattr(child, "name"):
            continue

        inner = process_inline_formatting(child)
        marker = _INLINE_MARKERS.get(child.name)
        if marker is not None:
            parts.append(f"{marker}{inner}{marker}")
        elif child.name == "link":
            href = child.get("href", "")
            parts.append(f"[{inner}]({href})")
        else:
            # For other elements, just keep their processed contents.
            parts.append(inner)

    return "".join(parts)
def extract_attachments(content):
"""Helper method to extract media paths from a document's content."""