mirror of https://github.com/langgenius/dify.git
fix: fix parse mailto / http link in table cell (#33224)
Co-authored-by: Oz <oz-agent@warp.dev>
This commit is contained in:
parent
27f9cdedad
commit
54637144c5
|
|
@ -204,26 +204,61 @@ class WordExtractor(BaseExtractor):
|
|||
return " ".join(unique_content)
|
||||
|
||||
def _parse_cell_paragraph(self, paragraph, image_map):
    """Render one table-cell paragraph as a single string.

    The paragraph's direct XML children are walked in document order so that
    hyperlink text, plain run text and image placeholders keep their relative
    positions.

    Args:
        paragraph: Paragraph whose ``_element`` children are iterated and whose
            ``part.rels`` is used to resolve hyperlink and image relationship ids.
        image_map: Mapping from an external image's relationship id, or an
            internal image's target part, to the text emitted for that image.

    Returns:
        The concatenated fragments with leading/trailing whitespace stripped.
    """
    # Loop-invariant qualified names, resolved once instead of once per child.
    hyperlink_tag = qn("w:hyperlink")
    run_tag = qn("w:r")
    rel_id_attr = qn("r:id")
    embed_attr = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"

    paragraph_content: list[str] = []

    for child in paragraph._element:
        tag = child.tag

        if tag == hyperlink_tag:
            # A w:hyperlink becomes a markdown link when its r:id resolves to a
            # relationship that is external, or whose target uses an http, https
            # or mailto scheme. Every other hyperlink (including ones that carry
            # no r:id, such as anchor-only bookmarks) is kept as plain link text.
            r_id = child.get(rel_id_attr)

            link_text_parts: list[str] = []
            for run_elem in child.findall(run_tag):
                run = Run(run_elem, paragraph)
                if run.text:
                    link_text_parts.append(run.text)
            link_text = "".join(link_text_parts).strip()

            if r_id:
                try:
                    rel = paragraph.part.rels.get(r_id)
                    target_ref = getattr(rel, "target_ref", None) if rel else None
                    if target_ref:
                        parsed_target = urlparse(str(target_ref))
                        if rel.is_external or parsed_target.scheme in ("http", "https", "mailto"):
                            # Fall back to the URL itself when the link has no visible text.
                            display_text = link_text or str(target_ref)
                            link_text = f"[{display_text}]({target_ref})"
                except Exception:
                    # Best effort: a broken relationship must not abort the whole cell;
                    # the plain link text (if any) is still emitted below.
                    logger.exception("Failed to resolve URL for hyperlink with r:id: %s", r_id)

            if link_text:
                paragraph_content.append(link_text)

        elif tag == run_tag:
            run = Run(child, paragraph)
            blips = run.element.xpath(".//a:blip")

            if not blips:
                # Plain text run.
                if run.text:
                    paragraph_content.append(run.text)
                continue

            # A run that contains images contributes only its image placeholders.
            for blip in blips:
                image_id = blip.get(embed_attr)
                if not image_id:
                    continue
                rel = paragraph.part.rels.get(image_id)
                if rel is None:
                    continue
                # For external images, use image_id as key; for internal, use target_part
                if rel.is_external:
                    if image_id in image_map:
                        paragraph_content.append(image_map[image_id])
                else:
                    image_part = rel.target_part
                    if image_part in image_map:
                        paragraph_content.append(image_map[image_part])

    return "".join(paragraph_content).strip()
|
||||
|
||||
def parse_docx(self, docx_path):
|
||||
|
|
|
|||
|
|
@ -423,15 +423,6 @@ def test_table_to_markdown_and_parse_helpers(monkeypatch):
|
|||
markdown = extractor._table_to_markdown(table, {})
|
||||
assert markdown == "| H1 | H2 |\n| --- | --- |\n| A | B |"
|
||||
|
||||
class FakeRunElement:
|
||||
def __init__(self, blips):
|
||||
self._blips = blips
|
||||
|
||||
def xpath(self, pattern):
|
||||
if pattern == ".//a:blip":
|
||||
return self._blips
|
||||
return []
|
||||
|
||||
class FakeBlip:
|
||||
def __init__(self, image_id):
|
||||
self.image_id = image_id
|
||||
|
|
@ -439,11 +430,31 @@ def test_table_to_markdown_and_parse_helpers(monkeypatch):
|
|||
def get(self, key):
|
||||
return self.image_id
|
||||
|
||||
class FakeRunChild:
    """Minimal stand-in for a ``w:r`` child element of a paragraph."""

    def __init__(self, blips, text=""):
        self.tag = qn("w:r")
        self.text = text
        self._blips = blips

    def xpath(self, pattern):
        # Only the blip query is answered; any other pattern yields nothing.
        return self._blips if pattern == ".//a:blip" else []
|
||||
|
||||
class FakeRun:
    """Lightweight replacement for ``Run`` exposing only ``element`` and ``text``."""

    def __init__(self, element, paragraph):
        # Mirror the subset used by _parse_cell_paragraph; ``paragraph`` is
        # accepted for signature compatibility but not stored.
        self.text = getattr(element, "text", "")
        self.element = element
|
||||
|
||||
# Patch we.Run so our lightweight child objects work with the extractor
|
||||
monkeypatch.setattr(we, "Run", FakeRun)
|
||||
|
||||
image_part = object()
|
||||
paragraph = SimpleNamespace(
|
||||
runs=[
|
||||
SimpleNamespace(element=FakeRunElement([FakeBlip(None), FakeBlip("ext"), FakeBlip("int")]), text=""),
|
||||
SimpleNamespace(element=FakeRunElement([]), text="plain"),
|
||||
_element=[
|
||||
FakeRunChild([FakeBlip(None), FakeBlip("ext"), FakeBlip("int")], text=""),
|
||||
FakeRunChild([], text="plain"),
|
||||
],
|
||||
part=SimpleNamespace(
|
||||
rels={
|
||||
|
|
@ -452,6 +463,7 @@ def test_table_to_markdown_and_parse_helpers(monkeypatch):
|
|||
}
|
||||
),
|
||||
)
|
||||
|
||||
image_map = {"ext": "EXT-IMG", image_part: "INT-IMG"}
|
||||
assert extractor._parse_cell_paragraph(paragraph, image_map) == "EXT-IMGINT-IMGplain"
|
||||
|
||||
|
|
@ -625,3 +637,83 @@ def test_parse_docx_covers_drawing_shapes_hyperlink_error_and_table_branch(monke
|
|||
assert "BrokenLink" in content
|
||||
assert "TABLE-MARKDOWN" in content
|
||||
logger_exception.assert_called_once()
|
||||
|
||||
|
||||
def test_parse_cell_paragraph_hyperlink_in_table_cell_http():
    """An external https hyperlink inside a table cell is rendered as a markdown link."""
    link_rel_id = "rIdHttp1"
    document = Document()
    cell_paragraph = document.add_table(rows=1, cols=1).cell(0, 0).paragraphs[0]

    # Assemble <w:hyperlink r:id=...><w:r><w:t>Dify</w:t></w:r></w:hyperlink> bottom-up.
    text_elem = OxmlElement("w:t")
    text_elem.text = "Dify"
    link_run = OxmlElement("w:r")
    link_run.append(text_elem)
    link_elem = OxmlElement("w:hyperlink")
    link_elem.set(qn("r:id"), link_rel_id)
    link_elem.append(link_run)
    cell_paragraph._p.append(link_elem)

    # Register the external relationship that the hyperlink's r:id refers to.
    document.part.rels.add_relationship(
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        "https://dify.ai",
        link_rel_id,
        is_external=True,
    )

    # Round-trip through a .docx on disk so the paragraph is read back from the saved file.
    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as handle:
        saved_path = handle.name
        document.save(saved_path)

    try:
        loaded_paragraph = Document(saved_path).tables[0].cell(0, 0).paragraphs[0]
        word_extractor = object.__new__(WordExtractor)
        rendered = word_extractor._parse_cell_paragraph(loaded_paragraph, {})
        assert rendered == "[Dify](https://dify.ai)"
    finally:
        if os.path.exists(saved_path):
            os.remove(saved_path)
|
||||
|
||||
|
||||
def test_parse_cell_paragraph_hyperlink_in_table_cell_mailto():
    """An external mailto hyperlink inside a table cell is rendered as a markdown link."""
    link_rel_id = "rIdMail1"
    document = Document()
    cell_paragraph = document.add_table(rows=1, cols=1).cell(0, 0).paragraphs[0]

    # Assemble <w:hyperlink r:id=...><w:r><w:t>john@test.com</w:t></w:r></w:hyperlink> bottom-up.
    text_elem = OxmlElement("w:t")
    text_elem.text = "john@test.com"
    link_run = OxmlElement("w:r")
    link_run.append(text_elem)
    link_elem = OxmlElement("w:hyperlink")
    link_elem.set(qn("r:id"), link_rel_id)
    link_elem.append(link_run)
    cell_paragraph._p.append(link_elem)

    # Register the external relationship that the hyperlink's r:id refers to.
    document.part.rels.add_relationship(
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        "mailto:john@test.com",
        link_rel_id,
        is_external=True,
    )

    # Round-trip through a .docx on disk so the paragraph is read back from the saved file.
    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as handle:
        saved_path = handle.name
        document.save(saved_path)

    try:
        loaded_paragraph = Document(saved_path).tables[0].cell(0, 0).paragraphs[0]
        word_extractor = object.__new__(WordExtractor)
        rendered = word_extractor._parse_cell_paragraph(loaded_paragraph, {})
        assert rendered == "[john@test.com](mailto:john@test.com)"
    finally:
        if os.path.exists(saved_path):
            os.remove(saved_path)
|
||||
|
|
|
|||
Loading…
Reference in New Issue