fix: fix parse mailto / http link in table cell (#33224)

Co-authored-by: Oz <oz-agent@warp.dev>
This commit is contained in:
wangxiaolei 2026-03-11 10:02:33 +08:00 committed by GitHub
parent 27f9cdedad
commit 54637144c5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 159 additions and 32 deletions

View File

@ -204,26 +204,61 @@ class WordExtractor(BaseExtractor):
return " ".join(unique_content)
def _parse_cell_paragraph(self, paragraph, image_map):
    """Render one table-cell paragraph as text, keeping images and hyperlinks.

    The paragraph's direct XML children are walked in order, so hyperlinks
    (``w:hyperlink``) and plain runs (``w:r``) are emitted in the sequence
    they appear in the cell. Children with any other tag are ignored.

    Args:
        paragraph: Paragraph whose ``_element`` yields the child elements and
            whose ``part.rels`` resolves relationship ids.
        image_map: Maps an external image's relationship id, or an internal
            image's target part, to the text inserted for that image.

    Returns:
        The concatenated pieces with surrounding whitespace stripped.
    """
    paragraph_content: list[str] = []
    # Resolve the qualified names once instead of on every child element.
    hyperlink_tag = qn("w:hyperlink")
    run_tag = qn("w:r")
    embed_attr = qn("r:embed")

    for child in paragraph._element:
        tag = child.tag
        if tag == hyperlink_tag:
            # A hyperlink is turned into a Markdown link only when its r:id
            # resolves to a relationship that is external or whose target uses
            # the http, https or mailto scheme. Anything else (for example a
            # hyperlink that carries no r:id) is emitted as its plain text.
            r_id = child.get(qn("r:id"))
            link_text = "".join(
                Run(run_elem, paragraph).text or "" for run_elem in child.findall(run_tag)
            ).strip()
            if r_id:
                try:
                    rel = paragraph.part.rels.get(r_id)
                    target_ref = getattr(rel, "target_ref", None) if rel is not None else None
                    if target_ref:
                        target = str(target_ref)
                        parsed_target = urlparse(target)
                        if rel.is_external or parsed_target.scheme in ("http", "https", "mailto"):
                            # Fall back to the URL itself when the link has no visible text.
                            link_text = f"[{link_text or target}]({target})"
                except Exception:
                    # Best effort: a link that cannot be resolved is logged and
                    # degrades to its plain text instead of failing the table.
                    logger.exception("Failed to resolve URL for hyperlink with r:id: %s", r_id)
            if link_text:
                paragraph_content.append(link_text)
        elif tag == run_tag:
            run = Run(child, paragraph)
            blips = run.element.xpath(".//a:blip")
            if not blips:
                # Plain text run.
                if run.text:
                    paragraph_content.append(run.text)
                continue
            # A run that holds images contributes only the mapped image text.
            for blip in blips:
                image_id = blip.get(embed_attr)
                if not image_id:
                    continue
                rel = paragraph.part.rels.get(image_id)
                if rel is None:
                    continue
                # External images are keyed by relationship id, internal ones by target part.
                if rel.is_external:
                    if image_id in image_map:
                        paragraph_content.append(image_map[image_id])
                else:
                    image_part = rel.target_part
                    if image_part in image_map:
                        paragraph_content.append(image_map[image_part])
    return "".join(paragraph_content).strip()
def parse_docx(self, docx_path):

View File

@ -423,15 +423,6 @@ def test_table_to_markdown_and_parse_helpers(monkeypatch):
markdown = extractor._table_to_markdown(table, {})
assert markdown == "| H1 | H2 |\n| --- | --- |\n| A | B |"
class FakeRunElement:
    """Stand-in for a run's XML element that answers only the image-blip XPath query."""

    def __init__(self, blips):
        self._blips = blips

    def xpath(self, pattern):
        # Every query other than the blip lookup yields no matches.
        return self._blips if pattern == ".//a:blip" else []
class FakeBlip:
def __init__(self, image_id):
self.image_id = image_id
@ -439,11 +430,31 @@ def test_table_to_markdown_and_parse_helpers(monkeypatch):
def get(self, key):
return self.image_id
class FakeRunChild:
    """Fake ``w:r`` child element carrying a tag, a text value and canned blip results."""

    def __init__(self, blips, text=""):
        self.tag = qn("w:r")
        self.text = text
        self._blips = blips

    def xpath(self, pattern):
        # Only the image-blip query returns anything; all other queries are empty.
        if pattern != ".//a:blip":
            return []
        return self._blips
class FakeRun:
    """Lightweight replacement for the run wrapper, exposing only ``element`` and ``text``."""

    def __init__(self, element, paragraph):
        # `paragraph` is accepted to match the (element, paragraph) call shape; it is not stored.
        text = getattr(element, "text", "")
        self.element = element
        self.text = text
# Patch we.Run so our lightweight child objects work with the extractor
monkeypatch.setattr(we, "Run", FakeRun)
image_part = object()
paragraph = SimpleNamespace(
runs=[
SimpleNamespace(element=FakeRunElement([FakeBlip(None), FakeBlip("ext"), FakeBlip("int")]), text=""),
SimpleNamespace(element=FakeRunElement([]), text="plain"),
_element=[
FakeRunChild([FakeBlip(None), FakeBlip("ext"), FakeBlip("int")], text=""),
FakeRunChild([], text="plain"),
],
part=SimpleNamespace(
rels={
@ -452,6 +463,7 @@ def test_table_to_markdown_and_parse_helpers(monkeypatch):
}
),
)
image_map = {"ext": "EXT-IMG", image_part: "INT-IMG"}
assert extractor._parse_cell_paragraph(paragraph, image_map) == "EXT-IMGINT-IMGplain"
@ -625,3 +637,83 @@ def test_parse_docx_covers_drawing_shapes_hyperlink_error_and_table_branch(monke
assert "BrokenLink" in content
assert "TABLE-MARKDOWN" in content
logger_exception.assert_called_once()
def test_parse_cell_paragraph_hyperlink_in_table_cell_http():
    """An external https hyperlink inside a table cell is rendered as a Markdown link."""
    doc = Document()
    table = doc.add_table(rows=1, cols=1)
    p = table.cell(0, 0).paragraphs[0]

    # Build the hyperlink element by hand: <w:hyperlink r:id=...><w:r><w:t>Dify</w:t></w:r></w:hyperlink>
    r_id = "rIdHttp1"
    hyperlink = OxmlElement("w:hyperlink")
    hyperlink.set(qn("r:id"), r_id)
    run_elem = OxmlElement("w:r")
    t = OxmlElement("w:t")
    t.text = "Dify"
    run_elem.append(t)
    hyperlink.append(run_elem)
    p._p.append(hyperlink)

    # Register the external relationship the hyperlink's r:id points at.
    doc.part.rels.add_relationship(
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        "https://dify.ai",
        r_id,
        is_external=True,
    )

    # Round-trip through a file on disk. The temporary directory is removed on
    # exit even if saving or parsing raises, so no manual cleanup is needed.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = os.path.join(tmp_dir, "hyperlink_http.docx")
        doc.save(tmp_path)
        reopened = Document(tmp_path)
        para = reopened.tables[0].cell(0, 0).paragraphs[0]
        # object.__new__ creates the extractor without running its __init__.
        extractor = object.__new__(WordExtractor)
        out = extractor._parse_cell_paragraph(para, {})

    assert out == "[Dify](https://dify.ai)"
def test_parse_cell_paragraph_hyperlink_in_table_cell_mailto():
    """An external mailto hyperlink inside a table cell is rendered as a Markdown link."""
    doc = Document()
    table = doc.add_table(rows=1, cols=1)
    p = table.cell(0, 0).paragraphs[0]

    # Build the hyperlink element by hand; its visible text is the e-mail address.
    r_id = "rIdMail1"
    hyperlink = OxmlElement("w:hyperlink")
    hyperlink.set(qn("r:id"), r_id)
    run_elem = OxmlElement("w:r")
    t = OxmlElement("w:t")
    t.text = "john@test.com"
    run_elem.append(t)
    hyperlink.append(run_elem)
    p._p.append(hyperlink)

    # Register the external relationship the hyperlink's r:id points at.
    doc.part.rels.add_relationship(
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        "mailto:john@test.com",
        r_id,
        is_external=True,
    )

    # Round-trip through a file on disk. The temporary directory is removed on
    # exit even if saving or parsing raises, so no manual cleanup is needed.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = os.path.join(tmp_dir, "hyperlink_mailto.docx")
        doc.save(tmp_path)
        reopened = Document(tmp_path)
        para = reopened.tables[0].cell(0, 0).paragraphs[0]
        # object.__new__ creates the extractor without running its __init__.
        extractor = object.__new__(WordExtractor)
        out = extractor._parse_cell_paragraph(para, {})

    assert out == "[john@test.com](mailto:john@test.com)"