From 519dee1bbe253a0c1971e5a4bf906c6b1209a8c0 Mon Sep 17 00:00:00 2001 From: ckstck Date: Sun, 22 Mar 2026 08:39:48 -0700 Subject: [PATCH 1/3] fix-zwsp --- api/core/rag/splitter/fixed_text_splitter.py | 5 +- ...ixed_text_splitter_invisible_delimiters.py | 192 ++++++++++++++++++ 2 files changed, 196 insertions(+), 1 deletion(-) create mode 100644 api/tests/unit_tests/core/rag/splitter/test_fixed_text_splitter_invisible_delimiters.py diff --git a/api/core/rag/splitter/fixed_text_splitter.py b/api/core/rag/splitter/fixed_text_splitter.py index 7a00e8a886..27c58a5d1a 100644 --- a/api/core/rag/splitter/fixed_text_splitter.py +++ b/api/core/rag/splitter/fixed_text_splitter.py @@ -53,7 +53,10 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter) def __init__(self, fixed_separator: str = "\n\n", separators: list[str] | None = None, **kwargs: Any): """Create a new TextSplitter.""" super().__init__(**kwargs) - self._fixed_separator = codecs.decode(fixed_separator, "unicode_escape") + if "\\" in fixed_separator: + self._fixed_separator = codecs.decode(fixed_separator, "unicode_escape") + else: + self._fixed_separator = fixed_separator self._separators = separators or ["\n\n", "\n", "。", ". ", " ", ""] def split_text(self, text: str) -> list[str]: diff --git a/api/tests/unit_tests/core/rag/splitter/test_fixed_text_splitter_invisible_delimiters.py b/api/tests/unit_tests/core/rag/splitter/test_fixed_text_splitter_invisible_delimiters.py new file mode 100644 index 0000000000..fec9540040 --- /dev/null +++ b/api/tests/unit_tests/core/rag/splitter/test_fixed_text_splitter_invisible_delimiters.py @@ -0,0 +1,192 @@ +""" +Test for invisible Unicode delimiter support in FixedRecursiveCharacterTextSplitter. + +Regression test for issue #31672: +ZWSP and other invisible Unicode characters should work as delimiters. 
+""" + +import pytest + +from core.rag.splitter.fixed_text_splitter import FixedRecursiveCharacterTextSplitter + + +class TestInvisibleDelimiters: + """Test invisible Unicode characters as delimiters.""" + + @pytest.fixture + def base_splitter_kwargs(self): + """Common kwargs for creating splitters.""" + def length_function(texts: list[str]) -> list[int]: + return [len(text) for text in texts] + + return { + "chunk_size": 100, + "chunk_overlap": 0, + "length_function": length_function, + } + + def test_zwsp_literal_delimiter(self, base_splitter_kwargs): + """ZWSP (U+200B) should work as a literal delimiter.""" + zwsp = "\u200b" + splitter = FixedRecursiveCharacterTextSplitter( + fixed_separator=zwsp, + **base_splitter_kwargs, + ) + + text = f"chunk1{zwsp}chunk2{zwsp}chunk3" + result = splitter.split_text(text) + + assert len(result) == 3 + assert result == ["chunk1", "chunk2", "chunk3"] + + def test_zwnbsp_literal_delimiter(self, base_splitter_kwargs): + """ZWNBSP (U+FEFF) should work as a literal delimiter.""" + zwnbsp = "\ufeff" + splitter = FixedRecursiveCharacterTextSplitter( + fixed_separator=zwnbsp, + **base_splitter_kwargs, + ) + + text = f"chunk1{zwnbsp}chunk2{zwnbsp}chunk3" + result = splitter.split_text(text) + + assert len(result) == 3 + assert result == ["chunk1", "chunk2", "chunk3"] + + def test_invisible_separator_literal(self, base_splitter_kwargs): + """INVISIBLE SEPARATOR (U+2063) should work as a literal delimiter.""" + invisible_sep = "\u2063" + splitter = FixedRecursiveCharacterTextSplitter( + fixed_separator=invisible_sep, + **base_splitter_kwargs, + ) + + text = f"chunk1{invisible_sep}chunk2{invisible_sep}chunk3" + result = splitter.split_text(text) + + assert len(result) == 3 + assert result == ["chunk1", "chunk2", "chunk3"] + + def test_word_joiner_literal(self, base_splitter_kwargs): + """WORD JOINER (U+2060) should work as a literal delimiter.""" + word_joiner = "\u2060" + splitter = FixedRecursiveCharacterTextSplitter( + 
fixed_separator=word_joiner, + **base_splitter_kwargs, + ) + + text = f"chunk1{word_joiner}chunk2{word_joiner}chunk3" + result = splitter.split_text(text) + + assert len(result) == 3 + assert result == ["chunk1", "chunk2", "chunk3"] + + def test_ltr_mark_literal(self, base_splitter_kwargs): + """LEFT-TO-RIGHT MARK (U+200E) should work as a literal delimiter.""" + ltr_mark = "\u200e" + splitter = FixedRecursiveCharacterTextSplitter( + fixed_separator=ltr_mark, + **base_splitter_kwargs, + ) + + text = f"chunk1{ltr_mark}chunk2{ltr_mark}chunk3" + result = splitter.split_text(text) + + assert len(result) == 3 + assert result == ["chunk1", "chunk2", "chunk3"] + + def test_escaped_newline_still_works(self, base_splitter_kwargs): + """Escaped newline \\n should still be decoded properly.""" + splitter = FixedRecursiveCharacterTextSplitter( + fixed_separator="\\n", + **base_splitter_kwargs, + ) + + text = "chunk1\nchunk2\nchunk3" + result = splitter.split_text(text) + + assert len(result) == 3 + assert result == ["chunk1", "chunk2", "chunk3"] + + def test_escaped_tab_still_works(self, base_splitter_kwargs): + """Escaped tab \\t should still be decoded properly.""" + splitter = FixedRecursiveCharacterTextSplitter( + fixed_separator="\\t", + **base_splitter_kwargs, + ) + + text = "chunk1\tchunk2\tchunk3" + result = splitter.split_text(text) + + assert len(result) == 3 + assert result == ["chunk1", "chunk2", "chunk3"] + + def test_literal_newline_works(self, base_splitter_kwargs): + """Literal newline should work without escaping.""" + splitter = FixedRecursiveCharacterTextSplitter( + fixed_separator="\n", + **base_splitter_kwargs, + ) + + text = "chunk1\nchunk2\nchunk3" + result = splitter.split_text(text) + + assert len(result) == 3 + assert result == ["chunk1", "chunk2", "chunk3"] + + def test_chinese_punctuation_literal(self, base_splitter_kwargs): + """Chinese punctuation should work as literal delimiters (related to issue).""" + # Test Chinese comma + splitter = 
FixedRecursiveCharacterTextSplitter( + fixed_separator=",", + **base_splitter_kwargs, + ) + + text = "chunk1,chunk2,chunk3" + result = splitter.split_text(text) + + assert len(result) == 3 + assert result == ["chunk1", "chunk2", "chunk3"] + + def test_mixed_content_with_zwsp(self, base_splitter_kwargs): + """Test realistic content with ZWSP delimiters.""" + zwsp = "\u200b" + splitter = FixedRecursiveCharacterTextSplitter( + fixed_separator=zwsp, + **base_splitter_kwargs, + ) + + text = f"First paragraph with some text.{zwsp}Second paragraph with more content.{zwsp}Third paragraph here." + result = splitter.split_text(text) + + assert len(result) == 3 + assert "First paragraph" in result[0] + assert "Second paragraph" in result[1] + assert "Third paragraph" in result[2] + + def test_empty_separator(self, base_splitter_kwargs): + """Empty separator should not split.""" + splitter = FixedRecursiveCharacterTextSplitter( + fixed_separator="", + **base_splitter_kwargs, + ) + + text = "chunk1 chunk2 chunk3" + result = splitter.split_text(text) + + # Should not split on empty separator + assert len(result) == 1 + + def test_escaped_unicode_hex_notation(self, base_splitter_kwargs): + """Escaped Unicode hex notation \\u200b should be decoded.""" + splitter = FixedRecursiveCharacterTextSplitter( + fixed_separator="\\u200b", + **base_splitter_kwargs, + ) + + zwsp = "\u200b" + text = f"chunk1{zwsp}chunk2{zwsp}chunk3" + result = splitter.split_text(text) + + assert len(result) == 3 + assert result == ["chunk1", "chunk2", "chunk3"] From 40e9284733e0d3389c8a093aa179cd278c4c89c4 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Sun, 22 Mar 2026 15:57:51 +0000 Subject: [PATCH 2/3] [autofix.ci] apply automated fixes --- .../splitter/test_fixed_text_splitter_invisible_delimiters.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/api/tests/unit_tests/core/rag/splitter/test_fixed_text_splitter_invisible_delimiters.py b/api/tests/unit_tests/core/rag/splitter/test_fixed_text_splitter_invisible_delimiters.py index fec9540040..b79e0f91d8 100644 --- a/api/tests/unit_tests/core/rag/splitter/test_fixed_text_splitter_invisible_delimiters.py +++ b/api/tests/unit_tests/core/rag/splitter/test_fixed_text_splitter_invisible_delimiters.py @@ -16,9 +16,10 @@ class TestInvisibleDelimiters: @pytest.fixture def base_splitter_kwargs(self): """Common kwargs for creating splitters.""" + def length_function(texts: list[str]) -> list[int]: return [len(text) for text in texts] - + return { "chunk_size": 100, "chunk_overlap": 0, From 891d8a5d99d05b570b28187d346d6887316811b2 Mon Sep 17 00:00:00 2001 From: ckstck Date: Sun, 22 Mar 2026 09:41:09 -0700 Subject: [PATCH 3/3] chore: optimized test file with Gemini review --- ...ixed_text_splitter_invisible_delimiters.py | 103 +++++------------- 1 file changed, 25 insertions(+), 78 deletions(-) diff --git a/api/tests/unit_tests/core/rag/splitter/test_fixed_text_splitter_invisible_delimiters.py b/api/tests/unit_tests/core/rag/splitter/test_fixed_text_splitter_invisible_delimiters.py index b79e0f91d8..633c567b05 100644 --- a/api/tests/unit_tests/core/rag/splitter/test_fixed_text_splitter_invisible_delimiters.py +++ b/api/tests/unit_tests/core/rag/splitter/test_fixed_text_splitter_invisible_delimiters.py @@ -26,97 +26,44 @@ class TestInvisibleDelimiters: "length_function": length_function, } - def test_zwsp_literal_delimiter(self, base_splitter_kwargs): - """ZWSP (U+200B) should work as a literal delimiter.""" - zwsp = "\u200b" + @pytest.mark.parametrize( + "delimiter", + [ + pytest.param("\u200b", id="zwsp"), + pytest.param("\ufeff", id="zwnbsp"), + pytest.param("\u2063", id="invisible_separator"), + pytest.param("\u2060", id="word_joiner"), + pytest.param("\u200e", id="ltr_mark"), + ], + ) + def test_invisible_literal_delimiters(self, base_splitter_kwargs, delimiter): + 
"""Test that various invisible Unicode characters work as literal delimiters.""" splitter = FixedRecursiveCharacterTextSplitter( - fixed_separator=zwsp, + fixed_separator=delimiter, **base_splitter_kwargs, ) - text = f"chunk1{zwsp}chunk2{zwsp}chunk3" + text = f"chunk1{delimiter}chunk2{delimiter}chunk3" result = splitter.split_text(text) assert len(result) == 3 assert result == ["chunk1", "chunk2", "chunk3"] - def test_zwnbsp_literal_delimiter(self, base_splitter_kwargs): - """ZWNBSP (U+FEFF) should work as a literal delimiter.""" - zwnbsp = "\ufeff" + @pytest.mark.parametrize( + ("escaped_char", "literal_char"), + [ + pytest.param("\\n", "\n", id="newline"), + pytest.param("\\t", "\t", id="tab"), + ], + ) + def test_escaped_chars_still_work(self, base_splitter_kwargs, escaped_char, literal_char): + """Escaped characters should still be decoded properly.""" splitter = FixedRecursiveCharacterTextSplitter( - fixed_separator=zwnbsp, + fixed_separator=escaped_char, **base_splitter_kwargs, ) - text = f"chunk1{zwnbsp}chunk2{zwnbsp}chunk3" - result = splitter.split_text(text) - - assert len(result) == 3 - assert result == ["chunk1", "chunk2", "chunk3"] - - def test_invisible_separator_literal(self, base_splitter_kwargs): - """INVISIBLE SEPARATOR (U+2063) should work as a literal delimiter.""" - invisible_sep = "\u2063" - splitter = FixedRecursiveCharacterTextSplitter( - fixed_separator=invisible_sep, - **base_splitter_kwargs, - ) - - text = f"chunk1{invisible_sep}chunk2{invisible_sep}chunk3" - result = splitter.split_text(text) - - assert len(result) == 3 - assert result == ["chunk1", "chunk2", "chunk3"] - - def test_word_joiner_literal(self, base_splitter_kwargs): - """WORD JOINER (U+2060) should work as a literal delimiter.""" - word_joiner = "\u2060" - splitter = FixedRecursiveCharacterTextSplitter( - fixed_separator=word_joiner, - **base_splitter_kwargs, - ) - - text = f"chunk1{word_joiner}chunk2{word_joiner}chunk3" - result = splitter.split_text(text) - - assert 
len(result) == 3 - assert result == ["chunk1", "chunk2", "chunk3"] - - def test_ltr_mark_literal(self, base_splitter_kwargs): - """LEFT-TO-RIGHT MARK (U+200E) should work as a literal delimiter.""" - ltr_mark = "\u200e" - splitter = FixedRecursiveCharacterTextSplitter( - fixed_separator=ltr_mark, - **base_splitter_kwargs, - ) - - text = f"chunk1{ltr_mark}chunk2{ltr_mark}chunk3" - result = splitter.split_text(text) - - assert len(result) == 3 - assert result == ["chunk1", "chunk2", "chunk3"] - - def test_escaped_newline_still_works(self, base_splitter_kwargs): - """Escaped newline \\n should still be decoded properly.""" - splitter = FixedRecursiveCharacterTextSplitter( - fixed_separator="\\n", - **base_splitter_kwargs, - ) - - text = "chunk1\nchunk2\nchunk3" - result = splitter.split_text(text) - - assert len(result) == 3 - assert result == ["chunk1", "chunk2", "chunk3"] - - def test_escaped_tab_still_works(self, base_splitter_kwargs): - """Escaped tab \\t should still be decoded properly.""" - splitter = FixedRecursiveCharacterTextSplitter( - fixed_separator="\\t", - **base_splitter_kwargs, - ) - - text = "chunk1\tchunk2\tchunk3" + text = f"chunk1{literal_char}chunk2{literal_char}chunk3" result = splitter.split_text(text) assert len(result) == 3