feat: add first-class support for CJK (#8530)

2025-05-03 10:00:07 -04:00 · 2024-10-17 21:14:17 +03:00 · 2024-10-17 21:14:17 +03:00 · b479f3bd65
commit b479f3bd65
parent 21815fb930
288 changed files with 3559 additions and 918 deletions
--- a/packages/excalidraw/element/textElement.test.ts
+++ b/packages/excalidraw/element/textElement.test.ts
@ -14,20 +14,23 @@ import {
 import type { ExcalidrawTextElementWithContainer, FontString } from "./types";

 describe("Test wrapText", () => {
-  const font = "20px Cascadia, width: Segoe UI Emoji" as FontString;
+  // font is irrelevant as jsdom does not support FontFace API
+  // `measureText` width is mocked to return `text.length` by `jest-canvas-mock`
+  // https://github.com/hustcc/jest-canvas-mock/blob/master/src/classes/TextMetrics.js
+  const font = "10px Cascadia, Segoe UI Emoji" as FontString;

-  it("shouldn't add new lines for trailing spaces", () => {
-    const text = "Hello whats up     ";
-    const maxWidth = 200 - BOUND_TEXT_PADDING * 2;
-    const res = wrapText(text, font, maxWidth);
-    expect(res).toBe(text);
+  it("should wrap the text correctly when word length is exactly equal to max width", () => {
+    const text = "Hello Excalidraw";
+    // Length of "Excalidraw" is 100 and exactly equal to max width
+    const res = wrapText(text, font, 100);
+    expect(res).toEqual(`Hello\nExcalidraw`);
  });

-  it("should work with emojis", () => {
-    const text = "😀";
-    const maxWidth = 1;
-    const res = wrapText(text, font, maxWidth);
-    expect(res).toBe("😀");
+  it("should return the text as is if max width is invalid", () => {
+    const text = "Hello Excalidraw";
+    expect(wrapText(text, font, NaN)).toEqual(text);
+    expect(wrapText(text, font, -1)).toEqual(text);
+    expect(wrapText(text, font, Infinity)).toEqual(text);
  });

  it("should show the text correctly when max width reached", () => {
@ -37,6 +40,237 @@ describe("Test wrapText", () => {
    expect(res).toBe("H\ne\nl\nl\no\n😀");
  });

+  it("should not wrap number when wrapping line", () => {
+    const text = "don't wrap this number 99,100.99";
+    const maxWidth = 300;
+    const res = wrapText(text, font, maxWidth);
+    expect(res).toBe("don't wrap this number\n99,100.99");
+  });
+
+  it("should support multiple (multi-codepoint) emojis", () => {
+    const text = "😀🗺🔥👩🏽‍🦰👨‍👩‍👧‍👦🇨🇿";
+    const maxWidth = 1;
+    const res = wrapText(text, font, maxWidth);
+    expect(res).toBe("😀\n🗺\n🔥\n👩🏽‍🦰\n👨‍👩‍👧‍👦\n🇨🇿");
+  });
+
+  it("should wrap the text correctly when text contains hyphen", () => {
+    let text =
+      "Wikipedia is hosted by Wikimedia- Foundation, a non-profit organization that also hosts a range-of other projects";
+    const res = wrapText(text, font, 110);
+    expect(res).toBe(
+      `Wikipedia\nis hosted\nby\nWikimedia-\nFoundation,\na non-\nprofit\norganizatio\nn that also\nhosts a\nrange-of\nother\nprojects`,
+    );
+
+    text = "Hello thereusing-now";
+    expect(wrapText(text, font, 100)).toEqual("Hello\nthereusing\n-now");
+  });
+
+  it("should support wrapping nested lists", () => {
+    const text = `\tA) one tab\t\t- two tabs        - 8 spaces`;
+
+    const maxWidth = 100;
+    const res = wrapText(text, font, maxWidth);
+    expect(res).toBe(`\tA) one\ntab\t\t- two\ntabs\n- 8 spaces`);
+
+    const maxWidth2 = 50;
+    const res2 = wrapText(text, font, maxWidth2);
+    expect(res2).toBe(`\tA)\none\ntab\n- two\ntabs\n- 8\nspace\ns`);
+  });
+
+  describe("When text is CJK", () => {
+    it("should break each CJK character when width is very small", () => {
+      // "안녕하세요" (Hangul) + "こんにちは世界" (Hiragana, Kanji) + "ｺﾝﾆﾁハ" (Katakana) + "你好" (Han) = "Hello Hello World Hello Hi"
+      const text = "안녕하세요こんにちは世界ｺﾝﾆﾁハ你好";
+      const maxWidth = 10;
+      const res = wrapText(text, font, maxWidth);
+      expect(res).toBe(
+        "안\n녕\n하\n세\n요\nこ\nん\nに\nち\nは\n世\n界\nｺ\nﾝ\nﾆ\nﾁ\nハ\n你\n好",
+      );
+    });
+
+    it("should break CJK text into longer segments when width is larger", () => {
+      // "안녕하세요" (Hangul) + "こんにちは世界" (Hiragana, Kanji) + "ｺﾝﾆﾁハ" (Katakana) + "你好" (Han) = "Hello Hello World Hello Hi"
+      const text = "안녕하세요こんにちは世界ｺﾝﾆﾁハ你好";
+      const maxWidth = 30;
+      const res = wrapText(text, font, maxWidth);
+
+      // measureText is mocked, so it's not precisely what would happen in prod
+      expect(res).toBe("안녕하\n세요こ\nんにち\nは世界\nｺﾝﾆ\nﾁハ你\n好");
+    });
+
+    it("should handle a combination of CJK, latin, emojis and whitespaces", () => {
+      const text = `a醫 醫      bb  你好  world-i-😀🗺🔥`;
+
+      const maxWidth = 150;
+      const res = wrapText(text, font, maxWidth);
+      expect(res).toBe(`a醫 醫      bb  你\n好  world-i-😀🗺\n🔥`);
+
+      const maxWidth2 = 50;
+      const res2 = wrapText(text, font, maxWidth2);
+      expect(res2).toBe(`a醫 醫\nbb  你\n好\nworld\n-i-😀\n🗺🔥`);
+
+      const maxWidth3 = 30;
+      const res3 = wrapText(text, font, maxWidth3);
+      expect(res3).toBe(`a醫\n醫\nbb\n你好\nwor\nld-\ni-\n😀\n🗺\n🔥`);
+    });
+
+    it("should break before and after a regular CJK character", () => {
+      const text = "HelloたWorld";
+      const maxWidth1 = 50;
+      const res1 = wrapText(text, font, maxWidth1);
+      expect(res1).toBe("Hello\nた\nWorld");
+
+      const maxWidth2 = 60;
+      const res2 = wrapText(text, font, maxWidth2);
+      expect(res2).toBe("Helloた\nWorld");
+    });
+
+    it("should break before and after certain CJK symbols", () => {
+      const text = "こんにちは〃世界";
+      const maxWidth1 = 50;
+      const res1 = wrapText(text, font, maxWidth1);
+      expect(res1).toBe("こんにちは\n〃世界");
+
+      const maxWidth2 = 60;
+      const res2 = wrapText(text, font, maxWidth2);
+      expect(res2).toBe("こんにちは〃\n世界");
+    });
+
+    it("should break after, not before for certain CJK pairs", () => {
+      const text = "Hello た。";
+      const maxWidth = 70;
+      const res = wrapText(text, font, maxWidth);
+      expect(res).toBe("Hello\nた。");
+    });
+
+    it("should break before, not after for certain CJK pairs", () => {
+      const text = "Hello「たWorld」";
+      const maxWidth = 60;
+      const res = wrapText(text, font, maxWidth);
+      expect(res).toBe("Hello\n「た\nWorld」");
+    });
+
+    it("should break after, not before for certain CJK character pairs", () => {
+      const text = "「Helloた」World";
+      const maxWidth = 70;
+      const res = wrapText(text, font, maxWidth);
+      expect(res).toBe("「Hello\nた」World");
+    });
+
+    it("should break Chinese sentences", () => {
+      const text = `中国你好！这是一个测试。
+我们来看看：人民币¥1234「很贵」
+（括号）、逗号，句号。空格 换行　全角符号…—`;
+
+      const maxWidth1 = 80;
+      const res1 = wrapText(text, font, maxWidth1);
+      expect(res1).toBe(`中国你好！这是一\n个测试。
+我们来看看：人民\n币¥1234「很\n贵」
+（括号）、逗号，\n句号。空格 换行\n全角符号…—`);
+
+      const maxWidth2 = 50;
+      const res2 = wrapText(text, font, maxWidth2);
+      expect(res2).toBe(`中国你好！\n这是一个测\n试。
+我们来看\n看：人民币\n¥1234\n「很贵」
+（括号）、\n逗号，句\n号。空格\n换行　全角\n符号…—`);
+    });
+  });
+
+  it("should break Japanese sentences", () => {
+    const text = `日本こんにちは！これはテストです。
+  見てみましょう：円￥1234「高い」
+  （括弧）、読点、句点。
+  空白 改行　全角記号…ー`;
+
+    const maxWidth1 = 80;
+    const res1 = wrapText(text, font, maxWidth1);
+    expect(res1).toBe(`日本こんにちは！\nこれはテストで\nす。
+  見てみましょ\nう：円￥1234\n「高い」
+  （括弧）、読\n点、句点。
+  空白 改行\n全角記号…ー`);
+
+    const maxWidth2 = 50;
+    const res2 = wrapText(text, font, maxWidth2);
+    expect(res2).toBe(`日本こんに\nちは！これ\nはテストで\nす。
+  見てみ\nましょう：\n円\n￥1234\n「高い」
+  （括\n弧）、読\n点、句点。
+  空白\n改行　全角\n記号…ー`);
+  });
+
+  it("should break Korean sentences", () => {
+    const text = `한국 안녕하세요! 이것은 테스트입니다.
+우리 보자: 원화₩1234「비싸다」
+(괄호), 쉼표, 마침표.
+공백 줄바꿈　전각기호…—`;
+
+    const maxWidth1 = 80;
+    const res1 = wrapText(text, font, maxWidth1);
+    expect(res1).toBe(`한국 안녕하세\n요! 이것은 테\n스트입니다.
+우리 보자: 원\n화₩1234「비\n싸다」
+(괄호), 쉼\n표, 마침표.
+공백 줄바꿈　전\n각기호…—`);
+
+    const maxWidth2 = 60;
+    const res2 = wrapText(text, font, maxWidth2);
+    expect(res2).toBe(`한국 안녕하\n세요! 이것\n은 테스트입\n니다.
+우리 보자:\n원화\n₩1234\n「비싸다」
+(괄호),\n쉼표, 마침\n표.
+공백 줄바꿈\n전각기호…—`);
+  });
+
+  describe("When text contains leading whitespaces", () => {
+    const text = "  \t   Hello world";
+
+    it("should preserve leading whitespaces", () => {
+      const maxWidth = 120;
+      const res = wrapText(text, font, maxWidth);
+      expect(res).toBe("  \t   Hello\nworld");
+    });
+
+    it("should break and collapse leading whitespaces when line breaks", () => {
+      const maxWidth = 60;
+      const res = wrapText(text, font, maxWidth);
+      expect(res).toBe("\nHello\nworld");
+    });
+
+    it("should break and collapse leading whitespaces whe words break", () => {
+      const maxWidth = 30;
+      const res = wrapText(text, font, maxWidth);
+      expect(res).toBe("\nHel\nlo\nwor\nld");
+    });
+  });
+
+  describe("When text contains trailing whitespaces", () => {
+    it("shouldn't add new lines for trailing spaces", () => {
+      const text = "Hello whats up     ";
+      const maxWidth = 200 - BOUND_TEXT_PADDING * 2;
+      const res = wrapText(text, font, maxWidth);
+      expect(res).toBe(text);
+    });
+
+    it("should ignore trailing whitespaces when line breaks", () => {
+      const text = "Hippopotomonstrosesquippedaliophobia        ??????";
+      const maxWidth = 400;
+      const res = wrapText(text, font, maxWidth);
+      expect(res).toBe("Hippopotomonstrosesquippedaliophobia\n??????");
+    });
+
+    it("should not ignore trailing whitespaces when word breaks", () => {
+      const text = "Hippopotomonstrosesquippedaliophobia        ??????";
+      const maxWidth = 300;
+      const res = wrapText(text, font, maxWidth);
+      expect(res).toBe("Hippopotomonstrosesquippedalio\nphobia        ??????");
+    });
+
+    it("should ignore trailing whitespaces when word breaks and line breaks", () => {
+      const text = "Hippopotomonstrosesquippedaliophobia        ??????";
+      const maxWidth = 180;
+      const res = wrapText(text, font, maxWidth);
+      expect(res).toBe("Hippopotomonstrose\nsquippedaliophobia\n??????");
+    });
+  });
+
  describe("When text doesn't contain new lines", () => {
    const text = "Hello whats up";

@ -44,7 +278,7 @@ describe("Test wrapText", () => {
      {
        desc: "break all words when width of each word is less than container width",
        width: 80,
-        res: `Hello \nwhats \nup`,
+        res: `Hello\nwhats\nup`,
      },
      {
        desc: "break all characters when width of each character is less than container width",
@ -66,7 +300,7 @@ p`,
        desc: "break words as per the width",

        width: 140,
-        res: `Hello whats \nup`,
+        res: `Hello whats\nup`,
      },
      {
        desc: "fit the container",
@ -96,7 +330,7 @@ whats up`;
      {
        desc: "break all words when width of each word is less than container width",
        width: 80,
-        res: `Hello\nwhats \nup`,
+        res: `Hello\nwhats\nup`,
      },
      {
        desc: "break all characters when width of each character is less than container width",
@ -142,26 +376,24 @@ whats up`,
      {
        desc: "fit characters of long string as per container width",
        width: 170,
-        res: `hellolongtextth\nisiswhatsupwith\nyouIamtypingggg\ngandtypinggg \nbreak it now`,
+        res: `hellolongtextthi\nsiswhatsupwithyo\nuIamtypingggggan\ndtypinggg break\nit now`,
      },
-
      {
        desc: "fit characters of long string as per container width and break words as per the width",

        width: 130,
-        res: `hellolongte
-xtthisiswha
-tsupwithyou
-Iamtypinggg
-ggandtyping
-gg break it
-now`,
+        res: `hellolongtex
+tthisiswhats
+upwithyouIam
+typingggggan
+dtypinggg
+break it now`,
      },
      {
        desc: "fit the long text when container width is greater than text length and move the rest to next line",

        width: 600,
-        res: `hellolongtextthisiswhatsupwithyouIamtypingggggandtypinggg \nbreak it now`,
+        res: `hellolongtextthisiswhatsupwithyouIamtypingggggandtypinggg\nbreak it now`,
      },
    ].forEach((data) => {
      it(`should ${data.desc}`, () => {
@ -171,68 +403,243 @@ now`,
    });
  });

-  it("should wrap the text correctly when word length is exactly equal to max width", () => {
-    const text = "Hello Excalidraw";
-    // Length of "Excalidraw" is 100 and exacty equal to max width
-    const res = wrapText(text, font, 100);
-    expect(res).toEqual(`Hello \nExcalidraw`);
-  });
+  describe("Test parseTokens", () => {
+    it("should tokenize latin", () => {
+      let text = "Excalidraw is a virtual collaborative whiteboard";

-  it("should return the text as is if max width is invalid", () => {
-    const text = "Hello Excalidraw";
-    expect(wrapText(text, font, NaN)).toEqual(text);
-    expect(wrapText(text, font, -1)).toEqual(text);
-    expect(wrapText(text, font, Infinity)).toEqual(text);
-  });
+      expect(parseTokens(text)).toEqual([
+        "Excalidraw",
+        " ",
+        "is",
+        " ",
+        "a",
+        " ",
+        "virtual",
+        " ",
+        "collaborative",
+        " ",
+        "whiteboard",
+      ]);

-  it("should wrap the text correctly when text contains hyphen", () => {
-    let text =
-      "Wikipedia is hosted by Wikimedia- Foundation, a non-profit organization that also hosts a range-of other projects";
-    const res = wrapText(text, font, 110);
-    expect(res).toBe(
-      `Wikipedia \nis hosted \nby \nWikimedia-\nFoundation,\na non-\nprofit \norganizati\non that \nalso hosts\na range-of\nother \nprojects`,
-    );
+      text =
+        "Wikipedia is hosted by Wikimedia- Foundation, a non-profit organization that also hosts a range-of other projects";
+      expect(parseTokens(text)).toEqual([
+        "Wikipedia",
+        " ",
+        "is",
+        " ",
+        "hosted",
+        " ",
+        "by",
+        " ",
+        "Wikimedia-",
+        " ",
+        "Foundation,",
+        " ",
+        "a",
+        " ",
+        "non-",
+        "profit",
+        " ",
+        "organization",
+        " ",
+        "that",
+        " ",
+        "also",
+        " ",
+        "hosts",
+        " ",
+        "a",
+        " ",
+        "range-",
+        "of",
+        " ",
+        "other",
+        " ",
+        "projects",
+      ]);
+    });

-    text = "Hello thereusing-now";
-    expect(wrapText(text, font, 100)).toEqual("Hello \nthereusin\ng-now");
-  });
-});
+    it("should not tokenize number", () => {
+      const text = "99,100.99";
+      const tokens = parseTokens(text);
+      expect(tokens).toEqual(["99,100.99"]);
+    });

-describe("Test parseTokens", () => {
-  it("should split into tokens correctly", () => {
-    let text = "Excalidraw is a virtual collaborative whiteboard";
-    expect(parseTokens(text)).toEqual([
-      "Excalidraw",
-      "is",
-      "a",
-      "virtual",
-      "collaborative",
-      "whiteboard",
-    ]);
+    it("should tokenize joined emojis", () => {
+      const text = `😬🌍🗺🔥☂️👩🏽‍🦰👨‍👩‍👧‍👦👩🏾‍🔬🏳️‍🌈🧔‍♀️🧑‍🤝‍🧑🙅🏽‍♂️✅0️⃣🇨🇿🦅`;
+      const tokens = parseTokens(text);

-    text =
-      "Wikipedia is hosted by Wikimedia- Foundation, a non-profit organization that also hosts a range-of other projects";
-    expect(parseTokens(text)).toEqual([
-      "Wikipedia",
-      "is",
-      "hosted",
-      "by",
-      "Wikimedia-",
-      "",
-      "Foundation,",
-      "a",
-      "non-",
-      "profit",
-      "organization",
-      "that",
-      "also",
-      "hosts",
-      "a",
-      "range-",
-      "of",
-      "other",
-      "projects",
-    ]);
+      expect(tokens).toEqual([
+        "😬",
+        "🌍",
+        "🗺",
+        "🔥",
+        "☂️",
+        "👩🏽‍🦰",
+        "👨‍👩‍👧‍👦",
+        "👩🏾‍🔬",
+        "🏳️‍🌈",
+        "🧔‍♀️",
+        "🧑‍🤝‍🧑",
+        "🙅🏽‍♂️",
+        "✅",
+        "0️⃣",
+        "🇨🇿",
+        "🦅",
+      ]);
+    });
+
+    it("should tokenize emojis mixed with mixed text", () => {
+      const text = `😬a🌍b🗺c🔥d☂️《👩🏽‍🦰》👨‍👩‍👧‍👦德👩🏾‍🔬こ🏳️‍🌈안🧔‍♀️g🧑‍🤝‍🧑h🙅🏽‍♂️e✅f0️⃣g🇨🇿10🦅#hash`;
+      const tokens = parseTokens(text);
+
+      expect(tokens).toEqual([
+        "😬",
+        "a",
+        "🌍",
+        "b",
+        "🗺",
+        "c",
+        "🔥",
+        "d",
+        "☂️",
+        "《",
+        "👩🏽‍🦰",
+        "》",
+        "👨‍👩‍👧‍👦",
+        "德",
+        "👩🏾‍🔬",
+        "こ",
+        "🏳️‍🌈",
+        "안",
+        "🧔‍♀️",
+        "g",
+        "🧑‍🤝‍🧑",
+        "h",
+        "🙅🏽‍♂️",
+        "e",
+        "✅",
+        "f0️⃣g", // bummer, but ok, as we traded kecaps not breaking (less common) for hash and numbers not breaking (more common)
+        "🇨🇿",
+        "10", // nice! do not break the number, as it's by default matched by \p{Emoji}
+        "🦅",
+        "#hash", // nice! do not break the hash, as it's by default matched by \p{Emoji}
+      ]);
+    });
+
+    it("should tokenize decomposed chars into their composed variants", () => {
+      // each input character is in a decomposed form
+      const text = "čでäぴέ다й한";
+      expect(text.normalize("NFC").length).toEqual(8);
+      expect(text).toEqual(text.normalize("NFD"));
+
+      const tokens = parseTokens(text);
+      expect(tokens.length).toEqual(8);
+      expect(tokens).toEqual(["č", "で", "ä", "ぴ", "έ", "다", "й", "한"]);
+    });
+
+    it("should tokenize artificial CJK", () => {
+      const text = `《道德經》醫-醫こんにちは世界！안녕하세요세계；다.다...원/달(((다)))[[1]]〚({((한))>)〛た…[Hello] World？ニューヨーク・￥3700.55す。090-1234-5678￥1,000〜＄5,000「素晴らしい！」〔重要〕＃１：Taro君30％は、（たなばた）〰￥110±￥570で20℃〜9:30〜10:00【一番】`;
+
+      // [
+      //   '《道',        '德',             '經》',    '醫-',
+      //   '醫',          'こ',             'ん',      'に',
+      //   'ち',          'は',             '世',      '界！',
+      //   '안',          '녕',             '하',      '세',
+      //   '요',          '세',             '계；',    '다.',
+      //   '다...',       '원/',            '달',      '(((다)))',
+      //   '[[1]]',       '〚({((한))>)〛', 'た…',     '[Hello]',
+      //   ' ',           'World？',        'ニ',      'ュ',
+      //   'ー',          'ヨ',             'ー',      'ク・',
+      //   '￥3700.55',   'す。',           '090-',    '1234-',
+      //   '5678￥1,000', '〜',             '＄5,000', '「素',
+      //   '晴',          'ら',             'し',      'い！」',
+      //   '〔重',        '要〕',           '＃',      '１：',
+      //   'Taro',        '君',             '30％',    'は、',
+      //   '（た',        'な',             'ば',      'た）',
+      //   '〰',          '￥110±',         '￥570',   'で',
+      //   '20℃',         '〜',             '9:30',    '〜',
+      //   '10:00',       '【一',           '番】'
+      // ]
+      const tokens = parseTokens(text);
+
+      // Latin
+      expect(tokens).toContain("[[1]]");
+      expect(tokens).toContain("[Hello]");
+      expect(tokens).toContain("World？");
+      expect(tokens).toContain("Taro");
+
+      // Chinese
+      expect(tokens).toContain("《道");
+      expect(tokens).toContain("德");
+      expect(tokens).toContain("經》");
+      expect(tokens).toContain("醫-");
+      expect(tokens).toContain("醫");
+
+      // Japanese
+      expect(tokens).toContain("こ");
+      expect(tokens).toContain("ん");
+      expect(tokens).toContain("に");
+      expect(tokens).toContain("ち");
+      expect(tokens).toContain("は");
+      expect(tokens).toContain("世");
+      expect(tokens).toContain("ニ");
+      expect(tokens).toContain("ク・");
+      expect(tokens).toContain("界！");
+      expect(tokens).toContain("た…");
+      expect(tokens).toContain("す。");
+      expect(tokens).toContain("ュ");
+      expect(tokens).toContain("ー");
+      expect(tokens).toContain("「素");
+      expect(tokens).toContain("晴");
+      expect(tokens).toContain("ら");
+      expect(tokens).toContain("し");
+      expect(tokens).toContain("い！」");
+      expect(tokens).toContain("君");
+      expect(tokens).toContain("は、");
+      expect(tokens).toContain("（た");
+      expect(tokens).toContain("な");
+      expect(tokens).toContain("ば");
+      expect(tokens).toContain("た）");
+      expect(tokens).toContain("で");
+      expect(tokens).toContain("【一");
+      expect(tokens).toContain("番】");
+
+      // Check for Korean
+      expect(tokens).toContain("안");
+      expect(tokens).toContain("녕");
+      expect(tokens).toContain("하");
+      expect(tokens).toContain("세");
+      expect(tokens).toContain("요");
+      expect(tokens).toContain("세");
+      expect(tokens).toContain("계；");
+      expect(tokens).toContain("다.");
+      expect(tokens).toContain("다...");
+      expect(tokens).toContain("원/");
+      expect(tokens).toContain("달");
+      expect(tokens).toContain("(((다)))");
+      expect(tokens).toContain("〚({((한))>)〛");
+
+      // Numbers and units
+      expect(tokens).toContain("￥3700.55");
+      expect(tokens).toContain("090-");
+      expect(tokens).toContain("1234-");
+      expect(tokens).toContain("5678￥1,000");
+      expect(tokens).toContain("＄5,000");
+      expect(tokens).toContain("１：");
+      expect(tokens).toContain("30％");
+      expect(tokens).toContain("￥110±");
+      expect(tokens).toContain("￥570");
+      expect(tokens).toContain("20℃");
+      expect(tokens).toContain("9:30");
+      expect(tokens).toContain("10:00");
+
+      // Punctuation and symbols
+      expect(tokens).toContain("〜");
+      expect(tokens).toContain("〰");
+      expect(tokens).toContain("＃");
+    });
  });
 });

--- a/packages/excalidraw/element/textElement.ts
+++ b/packages/excalidraw/element/textElement.ts
@ -16,6 +16,7 @@ import {
  BOUND_TEXT_PADDING,
  DEFAULT_FONT_FAMILY,
  DEFAULT_FONT_SIZE,
+  ENV,
  TEXT_ALIGN,
  VERTICAL_ALIGN,
 } from "../constants";
@ -30,6 +31,172 @@ import {
 } from "./containerCache";
 import type { ExtractSetType } from "../utility-types";

+/**
+ * Matches various emoji types.
+ *
+ * 1. basic emojis (😀, 🌍)
+ * 2. flags (🇨🇿)
+ * 3. multi-codepoint emojis:
+ *    - skin tones (👍🏽)
+ *    - variation selectors (☂️)
+ *    - keycaps (1️⃣)
+ *    - tag sequences (🏴󠁧󠁢󠁥󠁮󠁧󠁿)
+ *    - emoji sequences (👨‍👩‍👧‍👦, 👩‍🚀, 🏳️‍🌈)
+ *
+ * Unicode points:
+ * - \uFE0F: presentation selector
+ * - \u20E3: enclosing keycap
+ * - \u200D: ZWJ (zero width joiner)
+ * - \u{E0020}-\u{E007E}: tags
+ * - \u{E007F}: cancel tag
+ *
+ * @see https://unicode.org/reports/tr51/#EBNF_and_Regex, with changes:
+ * - replaced \p{Emoji} with [\p{Extended_Pictographic}\p{Emoji_Presentation}], see more in `should tokenize emojis mixed with mixed text` test
+ * - replaced \p{Emod} with \p{Emoji_Modifier} as some do not understand the abbreviation (i.e. https://devina.io/redos-checker)
+ */
+// NOTE: only the leading code point is narrowed to Extended_Pictographic / Emoji_Presentation,
+// the element following a ZWJ (\u200D) is still matched with the broader \p{Emoji}.
+// The whole pattern is a single capturing group, so that splitting a string by a regex composed
+// from this source keeps the matched emoji among the resulting parts.
+const _EMOJI_CHAR =
+  /(\p{RI}\p{RI}|[\p{Extended_Pictographic}\p{Emoji_Presentation}](?:\p{Emoji_Modifier}|\uFE0F\u20E3?|[\u{E0020}-\u{E007E}]+\u{E007F})?(?:\u200D(?:\p{RI}\p{RI}|[\p{Emoji}](?:\p{Emoji_Modifier}|\uFE0F\u20E3?|[\u{E0020}-\u{E007E}]+\u{E007F})?))*)/u;
+
+/**
+ * Detect a CJK char, though does not include every possible char used in CJK texts,
+ * such as symbols and punctuations.
+ *
+ * By default every CJK is a breaking point, though CJK has additional breaking points,
+ * including full width punctuations or symbols (Chinese and Japanese) and western punctuations (Korean).
+ *
+ * Additional CJK breaking point rules:
+ * - expect a break before (lookahead), but not after (negative lookbehind), i.e. "（" or "("
+ * - expect a break after (lookbehind), but not before (negative lookahead), i.e. "）" or ")"
+ * - expect a break always (lookahead and lookbehind), i.e. "〃"
+ *
+ * NOTE: this literal is a character class fragment - it is only meant to be embedded through
+ * its `.source` inside of `[...]` of a composed regex. Used on its own, it would match the four
+ * script escapes as a sequence of four chars, not as alternatives.
+ */
+const _CJK_CHAR =
+  /\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}/u;
+
+/**
+ * Following characters break only with CJK, not with alphabetic characters.
+ * This is essential for Korean, as it uses alphabetic punctuation, but expects CJK-like breaking points.
+ *
+ * Hello((た)) → ["Hello", "((た))"]
+ * Hello((World)) → ["Hello((World))"]
+ */
+const _CJK_BREAK_NOT_AFTER_BUT_BEFORE = /<\(\[\{/u;
+const _CJK_BREAK_NOT_BEFORE_BUT_AFTER = />\)\]\}.,:;\?!/u;
+const _CJK_BREAK_ALWAYS = /　〃〜～〰＃＆＊＋－ー／＝｜￢￣￤/u;
+const _CJK_SYMBOLS_AND_PUNCTUATION =
+  /（）［］｛｝〈〉《》｟｠｢｣「」『』【】〖〗〔〕〘〙〚〛＜＞〝〞＇〟・。ﾟﾞ，、．：；？！％ー/u;
+
+/**
+ * Following characters break with any character, even though are mostly used with CJK.
+ *
+ * Hello た。→ ["Hello", "た。"]
+ *        ↑ DON'T BREAK "た。" (negative lookahead)
+ * Hello「た」 World → ["Hello", "「た」", "World"]
+ *       ↑ DON'T BREAK "「た" (negative lookbehind)
+ *        ↑ DON'T BREAK "た」"(negative lookahead)
+ *      ↑ BREAK BEFORE "「" (lookahead)
+ *         ↑ BREAK AFTER "」" (lookbehind)
+ */
+const _ANY_BREAK_NOT_AFTER_BUT_BEFORE = /（［｛〈《｟｢「『【〖〔〘〚＜〝/u;
+const _ANY_BREAK_NOT_BEFORE_BUT_AFTER =
+  /）］｝〉》｠｣」』】〗〕〙〛＞〞＇〟・。ﾟﾞ，、．：；？！％±‥…\//u;
+
+/**
+ * Natural breaking points for any grammars.
+ *
+ * Hello-world
+ *       ↑ BREAK AFTER "-" → ["Hello-", "world"]
+ * Hello world
+ *      ↑ BREAK ALWAYS " " → ["Hello", " ", "world"]
+ */
+const _ANY_BREAK_AFTER = /-/u;
+const _ANY_BREAK_ALWAYS = /\s/u;
+
+/**
+ * Simple fallback for browsers (mainly Safari < 16.4) that don't support "Lookbehind assertion".
+ *
+ * Browser support as of 10/2024:
+ * - 91% Lookbehind assertion https://caniuse.com/mdn-javascript_regular_expressions_lookbehind_assertion
+ * - 94% Unicode character class escape https://caniuse.com/mdn-javascript_regular_expressions_unicode_character_class_escape
+ *
+ * Does not include advanced CJK breaking rules, but covers most of the core cases, especially for latin.
+ */
+const BREAK_LINE_REGEX_SIMPLE = new RegExp(
+  // - the character class is wrapped in a capturing group, so that `split` keeps the matched char as a token
+  // - the hyphen fragment is interpolated last, so that "-" is taken literally instead of forming a range
+  `${_EMOJI_CHAR.source}|([${_ANY_BREAK_ALWAYS.source}${_CJK_CHAR.source}${_CJK_BREAK_ALWAYS.source}${_ANY_BREAK_AFTER.source}])`,
+  "u",
+);
+
+// Hello World → ["Hello", " World"]
+//      ↑ BREAK BEFORE " "
+// HelloたWorld → ["Hello", "たWorld"]
+//      ↑ BREAK BEFORE "た"
+// Hello「World」→ ["Hello", "「World」"]
+//      ↑ BREAK BEFORE "「"
+const getLookaheadBreakingPoints = () => {
+  // zero-width position which is NOT preceded by a char from `_ANY_BREAK_NOT_AFTER_BUT_BEFORE`
+  // and IS followed by a char from `_ANY_BREAK_NOT_AFTER_BUT_BEFORE` or by a whitespace
+  const ANY_BREAKING_POINT = `(?<![${_ANY_BREAK_NOT_AFTER_BUT_BEFORE.source}])(?=[${_ANY_BREAK_NOT_AFTER_BUT_BEFORE.source}${_ANY_BREAK_ALWAYS.source}])`;
+  // zero-width position which is NOT preceded by a char from either of the "not after but before" sets
+  // and IS followed by any number of `_CJK_BREAK_NOT_AFTER_BUT_BEFORE` chars leading to a CJK char or a `_CJK_BREAK_ALWAYS` char
+  const CJK_BREAKING_POINT = `(?<![${_ANY_BREAK_NOT_AFTER_BUT_BEFORE.source}${_CJK_BREAK_NOT_AFTER_BUT_BEFORE.source}])(?=[${_CJK_BREAK_NOT_AFTER_BUT_BEFORE.source}]*[${_CJK_CHAR.source}${_CJK_BREAK_ALWAYS.source}])`;
+  // both alternatives contain a negative lookbehind, hence the construction is expected to throw
+  // in engines without the "Lookbehind assertion" support
+  return new RegExp(`(?:${ANY_BREAKING_POINT}|${CJK_BREAKING_POINT})`, "u");
+};
+
+// Hello World → ["Hello ", "World"]
+//       ↑ BREAK AFTER " "
+// Hello-World → ["Hello-", "World"]
+//       ↑ BREAK AFTER "-"
+// HelloたWorld → ["Helloた", "World"]
+//       ↑ BREAK AFTER "た"
+//「Hello」World → ["「Hello」", "World"]
+//       ↑ BREAK AFTER "」"
+const getLookbehindBreakingPoints = () => {
+  // zero-width position which is NOT followed by a char from `_ANY_BREAK_NOT_BEFORE_BUT_AFTER`
+  // and IS preceded by a char from `_ANY_BREAK_NOT_BEFORE_BUT_AFTER`, by a whitespace or by a hyphen
+  const ANY_BREAKING_POINT = `(?![${_ANY_BREAK_NOT_BEFORE_BUT_AFTER.source}])(?<=[${_ANY_BREAK_NOT_BEFORE_BUT_AFTER.source}${_ANY_BREAK_ALWAYS.source}${_ANY_BREAK_AFTER.source}])`;
+  // zero-width position which is NOT followed by a char from either of the "not before but after" sets or by a hyphen
+  // and IS preceded by a CJK char or a `_CJK_BREAK_ALWAYS` char, optionally trailed by `_CJK_BREAK_NOT_BEFORE_BUT_AFTER` chars
+  const CJK_BREAKING_POINT = `(?![${_ANY_BREAK_NOT_BEFORE_BUT_AFTER.source}${_CJK_BREAK_NOT_BEFORE_BUT_AFTER.source}${_ANY_BREAK_AFTER.source}])(?<=[${_CJK_CHAR.source}${_CJK_BREAK_ALWAYS.source}][${_CJK_BREAK_NOT_BEFORE_BUT_AFTER.source}]*)`;
+  // both alternatives contain a lookbehind, hence the construction is expected to throw
+  // in engines without the "Lookbehind assertion" support
+  return new RegExp(`(?:${ANY_BREAKING_POINT}|${CJK_BREAKING_POINT})`, "u");
+};
+
+/**
+ * Break a line based on the whitespaces, CJK / emoji chars and language specific breaking points,
+ * like hyphen for alphabetic and various full-width codepoints for CJK - especially Japanese, e.g.:
+ *
+ *  "Hello 世界。🌎🗺" → ["Hello", " ", "世", "界。", "🌎", "🗺"]
+ *  "Hello-world" → ["Hello-", "world"]
+ *  "「Hello World」" → ["「Hello", " ", "World」"]
+ */
+const getBreakLineRegexAdvanced = () => {
+  // alternatives in the order they are tried: a whole emoji first, then the zero-width breaking points
+  const alternatives = [
+    _EMOJI_CHAR.source,
+    getLookaheadBreakingPoints().source,
+    getLookbehindBreakingPoints().source,
+  ];
+
+  return new RegExp(alternatives.join("|"), "u");
+};
+
+let cachedBreakLineRegex: RegExp | undefined;
+
+// Builds the break line regex on the first call and reuses it afterwards.
+// Constructed lazily, as the advanced variant cannot be created in browsers
+// that don't support "Lookbehind assertion" - the simple variant is used there instead.
+const getBreakLineRegex = () => {
+  if (cachedBreakLineRegex) {
+    return cachedBreakLineRegex;
+  }
+
+  try {
+    cachedBreakLineRegex = getBreakLineRegexAdvanced();
+  } catch {
+    cachedBreakLineRegex = BREAK_LINE_REGEX_SIMPLE;
+  }
+
+  return cachedBreakLineRegex;
+};
+
+// single character class made of all the CJK related fragments (chars, symbols and punctuation)
+const CJK_REGEX = new RegExp(
+  `[${[
+    _CJK_CHAR.source,
+    _CJK_BREAK_ALWAYS.source,
+    _CJK_SYMBOLS_AND_PUNCTUATION.source,
+  ].join("")}]`,
+  "u",
+);
+
+// same pattern as `_EMOJI_CHAR`, exposed under a public-looking name
+const EMOJI_REGEX = new RegExp(_EMOJI_CHAR.source, "u");
+
+/** Tells whether `text` contains at least one char matched by `CJK_REGEX`. */
+export const containsCJK = (text: string) => CJK_REGEX.test(text);
+
+/** Tells whether `text` contains at least one emoji matched by `EMOJI_REGEX`. */
+export const containsEmoji = (text: string) => EMOJI_REGEX.test(text);
+
 export const normalizeText = (text: string) => {
  return (
    normalizeEOL(text)
@ -408,22 +575,132 @@ export const getTextHeight = (
  return getLineHeightInPx(fontSize, lineHeight) * lineCount;
 };

-export const parseTokens = (text: string) => {
-  // Splitting words containing "-" as those are treated as separate words
-  // by css wrapping algorithm eg non-profit => non-, profit
-  const words = text.split("-");
-  if (words.length > 1) {
-    // non-proft org => ['non-', 'profit org']
-    words.forEach((word, index) => {
-      if (index !== words.length - 1) {
-        words[index] = word += "-";
-      }
-    });
+export const parseTokens = (line: string) => {
+  const breakLineRegex = getBreakLineRegex();
+
+  // compose chars like č, で into their single-codepoint variants first (canonical equivalence with
+  // the multi-codepoint versions), so that a line never breaks in between c and ˇ
+  const composedLine = line.normalize("NFC");
+  const parts = composedLine.split(breakLineRegex);
+
+  // splitting by a regex with a capturing group also yields empty strings and `undefined`
+  // (whenever the emoji group did not take part in the match), drop those
+  return parts.filter(Boolean);
+};
+
+// true only for a string made of exactly one UTF-16 code unit - handles multi-byte chars (é, 中)
+// and purposefully does not handle multi-codepoint chars (i.e. ZWJ emoji sequences);
+// note that a single code point outside of the BMP takes two code units, so it is not handled either
+const isSingleCharacter = (maybeSingleCharacter: string) =>
+  maybeSingleCharacter.length === 1;
+
+/**
+ * Assertion active only in the test mode or in a dev build - despite the predicate-like name,
+ * it does not return anything.
+ *
+ * @throws Error when the assertion is active and `word` contains any whitespace
+ */
+const satisfiesWordInvariant = (word: string) => {
+  if (import.meta.env.MODE === ENV.TEST || import.meta.env.DEV) {
+    if (/\s/.test(word)) {
+      throw new Error("Word should not contain any whitespaces!");
+    }
   }
-  // Joining the words with space and splitting them again with space to get the
-  // final list of tokens
-  // ['non-', 'profit org'] =>,'non- proft org' => ['non-','profit','org']
-  return words.join(" ").split(" ");
+};
+
+/**
+ * Breaks a single word (a token without whitespaces) into parts, char by char,
+ * so that a part grows only while its summed char widths stay within `maxWidth`.
+ *
+ * @returns parts of the word in order; a word matching an emoji is returned as the only part
+ */
+const wrapWord = (
+  word: string,
+  font: FontString,
+  maxWidth: number,
+): Array<string> => {
+  // emojis come already separated from the tokenizer, never split them into codepoints
+  if (EMOJI_REGEX.test(word)) {
+    return [word];
+  }
+
+  satisfiesWordInvariant(word);
+
+  const parts: Array<string> = [];
+  let part = "";
+  let partWidth = 0;
+
+  // iterating a string yields whole code points, so surrogate pairs stay intact
+  for (const char of word) {
+    const width = charWidth.calculate(char, font);
+    const widthWithChar = partWidth + width;
+
+    if (widthWithChar <= maxWidth) {
+      part += char;
+      partWidth = widthWithChar;
+    } else {
+      // char does not fit anymore - flush what we have (if anything) and start over with the char
+      if (part) {
+        parts.push(part);
+      }
+
+      part = char;
+      partWidth = width;
+    }
+  }
+
+  if (part) {
+    parts.push(part);
+  }
+
+  return parts;
+};
+
+/**
+ * Wraps a single line into multiple lines, aiming to keep each of them within `maxWidth`.
+ *
+ * Tokens from `parseTokens` are appended greedily to the current line. Whitespace tokens are
+ * always appended (the width check is skipped for them) and every pushed line gets its trailing
+ * whitespaces trimmed. A token which does not fit even on an empty line is broken by `wrapWord`.
+ *
+ * @returns wrapped lines in order
+ */
+const wrapLine = (
+  line: string,
+  font: FontString,
+  maxWidth: number,
+): string[] => {
+  const lines: Array<string> = [];
+  const tokens = parseTokens(line);
+  const tokenIterator = tokens[Symbol.iterator]();
+
+  let currentLine = "";
+  let currentLineWidth = 0;
+
+  let iterator = tokenIterator.next();
+
+  while (!iterator.done) {
+    const token = iterator.value;
+    const testLine = currentLine + token;
+
+    // tokens made of a single code unit (i.e. a whitespace or a CJK char) are measured on their own
+    // and added to the running width, as kerning should not apply here; any other token re-measures the whole line
+    const testLineWidth = isSingleCharacter(token)
+      ? currentLineWidth + charWidth.calculate(token, font)
+      : getLineWidth(testLine, font, true);
+
+    // build up the current line, skipping length check for possibly trailing whitespaces
+    if (/\s/.test(token) || testLineWidth <= maxWidth) {
+      currentLine = testLine;
+      currentLineWidth = testLineWidth;
+      iterator = tokenIterator.next();
+      continue;
+    }
+
+    // current line is empty => just the token (word) is longer than `maxWidth` and needs to be wrapped
+    if (!currentLine) {
+      const wrappedWord = wrapWord(token, font, maxWidth);
+      const trailingLine = wrappedWord[wrappedWord.length - 1] ?? "";
+      const precedingLines = wrappedWord.slice(0, -1);
+
+      lines.push(...precedingLines);
+
+      // trailing line of the wrapped word might still be joined with next token/s
+      currentLine = trailingLine;
+      currentLineWidth = getLineWidth(trailingLine, font, true);
+      iterator = tokenIterator.next();
+    } else {
+      // push & reset, but don't iterate on the next token, as we didn't use it yet!
+      // (a current line made of whitespaces only is pushed as an empty line here)
+      lines.push(currentLine.trimEnd());
+
+      // purposefully not iterating and not setting `currentLine` to `token`, so that we could use a simple !currentLine check above
+      currentLine = "";
+      currentLineWidth = 0;
+    }
+  }
+
+  // iterator done, push the trailing line if exists
+  if (currentLine) {
+    lines.push(currentLine.trimEnd());
+  }
+
+  return lines;
 };

 export const wrapText = (
@ -440,134 +717,17 @@ export const wrapText = (

  const lines: Array<string> = [];
  const originalLines = text.split("\n");
-  const spaceAdvanceWidth = getLineWidth(" ", font, true);
-
-  let currentLine = "";
-  let currentLineWidthTillNow = 0;
-
-  const push = (str: string) => {
-    if (str.trim()) {
-      lines.push(str);
-    }
-  };
-
-  const resetParams = () => {
-    currentLine = "";
-    currentLineWidthTillNow = 0;
-  };

  for (const originalLine of originalLines) {
    const currentLineWidth = getLineWidth(originalLine, font, true);

-    // Push the line if its <= maxWidth
    if (currentLineWidth <= maxWidth) {
      lines.push(originalLine);
      continue;
    }

-    const words = parseTokens(originalLine);
-    resetParams();
-
-    let index = 0;
-
-    while (index < words.length) {
-      const currentWordWidth = getLineWidth(words[index], font, true);
-
-      // This will only happen when single word takes entire width
-      if (currentWordWidth === maxWidth) {
-        push(words[index]);
-        index++;
-      }
-
-      // Start breaking longer words exceeding max width
-      else if (currentWordWidth > maxWidth) {
-        // push current line since the current word exceeds the max width
-        // so will be appended in next line
-        push(currentLine);
-
-        resetParams();
-
-        while (words[index].length > 0) {
-          const currentChar = String.fromCodePoint(
-            words[index].codePointAt(0)!,
-          );
-
-          const line = currentLine + currentChar;
-          // use advance width instead of the actual width as it's closest to the browser wapping algo
-          // use width of the whole line instead of calculating individual chars to accomodate for kerning
-          const lineAdvanceWidth = getLineWidth(line, font, true);
-          const charAdvanceWidth = charWidth.calculate(currentChar, font);
-
-          currentLineWidthTillNow = lineAdvanceWidth;
-          words[index] = words[index].slice(currentChar.length);
-
-          if (currentLineWidthTillNow >= maxWidth) {
-            push(currentLine);
-            currentLine = currentChar;
-            currentLineWidthTillNow = charAdvanceWidth;
-          } else {
-            currentLine = line;
-          }
-        }
-        // push current line if appending space exceeds max width
-        if (currentLineWidthTillNow + spaceAdvanceWidth >= maxWidth) {
-          push(currentLine);
-          resetParams();
-          // space needs to be appended before next word
-          // as currentLine contains chars which couldn't be appended
-          // to previous line unless the line ends with hyphen to sync
-          // with css word-wrap
-        } else if (!currentLine.endsWith("-")) {
-          currentLine += " ";
-          currentLineWidthTillNow += spaceAdvanceWidth;
-        }
-        index++;
-      } else {
-        // Start appending words in a line till max width reached
-        while (currentLineWidthTillNow < maxWidth && index < words.length) {
-          const word = words[index];
-          currentLineWidthTillNow = getLineWidth(
-            currentLine + word,
-            font,
-            true,
-          );
-
-          if (currentLineWidthTillNow > maxWidth) {
-            push(currentLine);
-            resetParams();
-
-            break;
-          }
-          index++;
-
-          // if word ends with "-" then we don't need to add space
-          // to sync with css word-wrap
-          const shouldAppendSpace = !word.endsWith("-");
-          currentLine += word;
-
-          if (shouldAppendSpace) {
-            currentLine += " ";
-          }
-
-          // Push the word if appending space exceeds max width
-          if (currentLineWidthTillNow + spaceAdvanceWidth >= maxWidth) {
-            if (shouldAppendSpace) {
-              lines.push(currentLine.slice(0, -1));
-            } else {
-              lines.push(currentLine);
-            }
-            resetParams();
-            break;
-          }
-        }
-      }
-    }
-
-    if (currentLine.slice(-1) === " ") {
-      // only remove last trailing space which we have added when joining words
-      currentLine = currentLine.slice(0, -1);
-      push(currentLine);
-    }
+    const wrappedLine = wrapLine(originalLine, font, maxWidth);
+    lines.push(...wrappedLine);
  }

  return lines.join("\n");
@ -577,24 +737,41 @@ export const charWidth = (() => {
  const cachedCharWidth: { [key: FontString]: Array<number> } = {};

+  /**
+   * Returns the width of a single character for the given font, as measured
+   * by `getLineWidth(char, font, true)`. The result is cached per font under
+   * the character's code point, so repeated lookups skip the measurement.
+   */
  const calculate = (char: string, font: FontString) => {
-    const ascii = char.charCodeAt(0);
+    // key by the full code point: `charCodeAt(0)` only yields the high surrogate,
+    // so astral characters sharing it (most emoji) would collide on one cache slot;
+    // an empty string keeps the `NaN` key that `charCodeAt(0)` used to produce
+    const unicode = char.codePointAt(0) ?? NaN;
    if (!cachedCharWidth[font]) {
      cachedCharWidth[font] = [];
    }
-    if (!cachedCharWidth[font][ascii]) {
+    // compare against `undefined` so that a cached width of 0 counts as a hit
+    if (cachedCharWidth[font][unicode] === undefined) {
      const width = getLineWidth(char, font, true);
-      cachedCharWidth[font][ascii] = width;
+      cachedCharWidth[font][unicode] = width;
    }

-    return cachedCharWidth[font][ascii];
+    return cachedCharWidth[font][unicode];
  };

+  /** Returns the width cache of `font` (indexed by code point); `undefined` until `calculate` or `clearCache` ran for that font. */
  const getCache = (font: FontString) => {
    return cachedCharWidth[font];
  };
+
+  /** Drops every cached width of `font` by replacing its cache with an empty array. */
+  const clearCache = (font: FontString) => {
+    cachedCharWidth[font] = [];
+  };
+
  return {
    calculate,
    getCache,
+    clearCache,
  };
 })();

--- a/packages/excalidraw/element/textWysiwyg.test.tsx
+++ b/packages/excalidraw/element/textWysiwyg.test.tsx
@ -917,7 +917,7 @@ describe("textWysiwyg", () => {

      Keyboard.exitTextEditor(editor);
      text = h.elements[1] as ExcalidrawTextElementWithContainer;
-      expect(text.text).toBe("Hello \nWorld!");
+      expect(text.text).toBe("Hello\nWorld!");
      expect(text.originalText).toBe("Hello World!");
      expect(text.y).toBe(
        rectangle.y + h.elements[0].height / 2 - text.height / 2,
@ -1220,7 +1220,7 @@ describe("textWysiwyg", () => {
      );

      expect((h.elements[1] as ExcalidrawTextElementWithContainer).text).toBe(
-        "Online \nwhitebo\nard \ncollabo\nration \nmade \neasy",
+        "Online\nwhiteboa\nrd\ncollabor\nation\nmade\neasy",
      );
      fireEvent.contextMenu(GlobalTestState.interactiveCanvas, {
        button: 2,