| " + html.escape(cell.text) + " | " + else: + html_table += "" + html.escape(cell.text) + " | " + html_table += "
|---|
| Name | Size | Date modified | +
|---|---|---|
| {html.escape(entry)} | ' + + f"{html.escape(size)} | " + + f"{html.escape(mtime)} | " + + "
| " + html.escape(cell.text) + " | " + else: + html_table += "" + html.escape(cell.text) + " | " + html_table += "
|---|
{clean_md}
+ #
+ #
+ # """
+
+ # # 使用base64编码并通过goto显示
+ # goto(
+ # "data:text/html;base64," +
+ # base64.b64encode(html_content.encode("utf-8")).decode("utf-8")
+ # )
+
+ # # 触发pageshow事件
+ # page.evaluate("""
+ # const event = new Event('pageshow', {
+ # bubbles: true,
+ # cancelable: false
+ # });
+ # window.dispatchEvent(event);
+ # """)
+
+ # # 关闭浏览器
+ # new_browser.close()
+ except Exception as e:
+ raise Exception(f"Get page markdown error: {str(e)}")
+if __name__ == "__main__":  # manual smoke test: render one page in Chromium and print its Markdown
+    from playwright.sync_api import sync_playwright
+    import io
+    with sync_playwright() as p:
+        # Launch a headless Chromium instance.
+        browser = p.chromium.launch(headless=True)
+        page = browser.new_page()
+
+        # Load the sample page (a ResearchGate publication) and wait until the network is idle.
+        url = "https://www.researchgate.net/publication/232696279_The_influence_of_social_environment_on_sex_determination_in_harlequin_shrimp_Hymenocera_picta_Decapoda_Gnathophyllidae"
+        page.goto(url, wait_until="networkidle")
+
+        # Grab the rendered HTML; named `page_html` so this module-level assignment does not rebind the `html` module used by the html.escape(...) calls earlier in this file.
+        page_html = page.evaluate("document.documentElement.outerHTML;")
+
+        # Convert the HTML to Markdown with MarkdownConverter.
+        mdconvert = MarkdownConverter()
+        res = mdconvert.convert_stream(io.StringIO(page_html), file_extension=".html", url=url)
+
+        print('标题:', res.title)
+        print('\n内容:')
+        print(res.text_content)
+
+        # Close the browser.
+        browser.close()
+    # mdconvert = MarkdownConverter()
+    # res = mdconvert.convert_local("/Users/tangjiabin/Documents/reasoning/metachain/eval_data/GAIA/2023/validation/2b3ef98c-cc05-450b-a719-711aee40ac65.mp3")
+    # print('title:', res.title)
+    # print('content:', res.text_content)
\ No newline at end of file
diff --git a/metachain/environment/new.py b/metachain/environment/new.py
new file mode 100644
index 0000000..1ccb6f0
--- /dev/null
+++ b/metachain/environment/new.py
@@ -0,0 +1,568 @@
+import base64
+import io
+import logging
+import pkgutil
+import re
+from typing import Literal
+
+import numpy as np
+import PIL.Image
+import playwright.sync_api
+
+from .constants import BROWSERGYM_ID_ATTRIBUTE as BID_ATTR
+from .constants import BROWSERGYM_SETOFMARKS_ATTRIBUTE as SOM_ATTR
+from .constants import BROWSERGYM_VISIBILITY_ATTRIBUTE as VIS_ATTR
+
+MARK_FRAMES_MAX_TRIES = 3  # presumably the retry cap for frame marking; not referenced in the visible part of this file - TODO confirm
+
+
+logger = logging.getLogger(__name__)  # module-level logger, named after this module
+
+
+class MarkingError(Exception):  # module-specific exception; presumably raised when DOM/frame marking fails - raise sites not visible here, confirm
+ pass
+
+
+def _pre_extract(
+ page: playwright.sync_api.Page,
+ tags_to_mark: Literal["all", "standard_html"] = "standard_html",
+ lenient: bool = False,
+):
+ """
+ pre-extraction routine, marks dom elements (set bid and dynamic attributes like value and checked)
+ """
+ js_frame_mark_elements = pkgutil.get_data(__name__, "javascript/frame_mark_elements.js").decode(
+ "utf-8"
+ )
+
+ # we can't run this loop in JS due to Same-Origin Policy
+ # (can't access the content of an iframe from a another one)
+ def mark_frames_recursive(frame, frame_bid: str):
+ assert frame_bid == "" or re.match(r"^[a-z][a-zA-Z]*$", frame_bid)
+ logger.debug(f"Marking frame {repr(frame_bid)}")
+
+ # mark all DOM elements in the frame (it will use the parent frame element's bid as a prefix)
+ warning_msgs = frame.evaluate(
+ js_frame_mark_elements,
+ [frame_bid, BID_ATTR, tags_to_mark],
+ )
+ # print warning messages if any
+ for msg in warning_msgs:
+ logger.warning(msg)
+
+ # recursively mark all descendant frames
+ for child_frame in frame.child_frames:
+ # deal with detached frames
+ if child_frame.is_detached():
+ continue
+ # deal with weird frames (pdf viewer in