Skip to content

Commit c476160

Browse files
committed
webui: Moved preprocessLaTeX to lib/utils
1 parent 9b31191 commit c476160

File tree

3 files changed

+223
-84
lines changed

3 files changed

+223
-84
lines changed

tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte

Lines changed: 1 addition & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import rehypeKatex from 'rehype-katex';
99
import rehypeStringify from 'rehype-stringify';
1010
import { copyCodeToClipboard } from '$lib/utils/copy';
11-
import { maskInlineLaTeX } from '$lib/utils/latex-protection';
11+
import { preprocessLaTeX } from '$lib/utils/latex-protection';
1212
import { browser } from '$app/environment';
1313
import 'katex/dist/katex.min.css';
1414
@@ -155,87 +155,6 @@
155155
return mutated ? tempDiv.innerHTML : html;
156156
}
157157
158-
// See also:
159-
// https://github.com/danny-avila/LibreChat/blob/main/client/src/utils/latex.ts
160-
161-
// Protect code blocks: ```...``` and `...`
162-
const codeBlockRegex = /(```[\s\S]*?```|`[^`\n]+`)/g;
163-
164-
export function preprocessLaTeX(content: string): string {
165-
// Step 1: Protect code blocks
166-
const codeBlocks: string[] = [];
167-
content = content.replace(codeBlockRegex, (match) => {
168-
codeBlocks.push(match);
169-
return `<<CODE_BLOCK_${codeBlocks.length - 1}>>`;
170-
});
171-
172-
// Step 2: Protect existing LaTeX expressions
173-
const latexExpressions: string[] = [];
174-
175-
// Match \(...\), \[...\], $$...$$ and protect them
176-
content = content.replace(/(\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]|\\\(.*?\\\))/g, (match) => {
177-
latexExpressions.push(match);
178-
return `<<LATEX_${latexExpressions.length - 1}>>`;
179-
});
180-
181-
// Protect inline $...$ but NOT if it looks like money (e.g., $10, $3.99)
182-
content = maskInlineLaTeX(content, latexExpressions);
183-
184-
// Step 3: Escape standalone $ before digits (currency like $5 → \$5)
185-
// (Now that inline math is protected, this will only escape dollars not already protected)
186-
content = content.replace(/\$(?=\d)/g, '\\$');
187-
188-
// Step 4: Restore protected LaTeX expressions (they are valid)
189-
content = content.replace(/<<LATEX_(\d+)>>/g, (_, index) => {
190-
return latexExpressions[parseInt(index)];
191-
});
192-
193-
// Step 5: Restore code blocks
194-
content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (_, index) => {
195-
return codeBlocks[parseInt(index)];
196-
});
197-
198-
// Step 6: Apply additional escaping functions (brackets and mhchem)
199-
content = escapeBrackets(content);
200-
if (content.includes('\\ce{') || content.includes('\\pu{')) {
201-
content = escapeMhchem(content);
202-
}
203-
204-
// Final pass: Convert \(...\) → $...$, \[...\] → $$...$$
205-
content = content
206-
.replace(/\\\((.+?)\\\)/g, '$$$1$') // inline
207-
.replace(/\\\[(.+?)\\\]/g, '$$$$1$$'); // display
208-
209-
return content;
210-
}
211-
212-
function escapeBrackets(text: string): string {
213-
const pattern = /(```[\S\s]*?```|`.*?`)|\\\[([\S\s]*?[^\\])\\]|\\\((.*?)\\\)/g;
214-
return text.replace(
215-
pattern,
216-
(
217-
match: string,
218-
codeBlock: string | undefined,
219-
squareBracket: string | undefined,
220-
roundBracket: string | undefined
221-
): string => {
222-
if (codeBlock != null) {
223-
return codeBlock;
224-
} else if (squareBracket != null) {
225-
return `$$${squareBracket}$$`;
226-
} else if (roundBracket != null) {
227-
return `$${roundBracket}$`;
228-
}
229-
return match;
230-
}
231-
);
232-
}
233-
234-
// Escape $\\ce{...} → $\\ce{...} but with proper handling
235-
function escapeMhchem(text: string): string {
236-
return text.replaceAll('$\\ce{', '$\\\\ce{').replaceAll('$\\pu{', '$\\\\pu{');
237-
}
238-
239158
async function processMarkdown(text: string): Promise<string> {
240159
try {
241160
const processedText = preprocessLaTeX(text);

tools/server/webui/src/lib/utils/latex-protection.test.ts

Lines changed: 122 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
import { describe, it, expect } from 'vitest';
2-
import { maskInlineLaTeX } from './latex-protection';
1+
/* eslint-disable no-irregular-whitespace */
2+
import { describe, it, expect, test } from 'vitest';
3+
import { maskInlineLaTeX, preprocessLaTeX } from './latex-protection';
34

45
describe('maskInlineLaTeX', () => {
56
it('should protect LaTeX $x + y$ but not money $3.99', () => {
@@ -101,3 +102,122 @@ describe('maskInlineLaTeX', () => {
101102
expect(latexExpressions).toEqual([]);
102103
});
103104
});
105+
106+
// End-to-end expectations for preprocessLaTeX: delimiter normalization
// (\(..\) → $..$, \[..\] → $$..$$), currency escaping, mhchem escaping and
// code-block passthrough.
describe('preprocessLaTeX', () => {
	test('converts inline \\( ... \\) to $...$', () => {
		const input =
			'\\( \\mathrm{GL}_2(\\mathbb{F}_7) \\): Group of invertible matrices with entries in \\(\\mathbb{F}_7\\).';

		expect(preprocessLaTeX(input)).toBe(
			'$ \\mathrm{GL}_2(\\mathbb{F}_7) $: Group of invertible matrices with entries in $\\mathbb{F}_7$.'
		);
	});

	test('preserves display math \\[ ... \\] and protects adjacent text', () => {
		const matrices =
			'\\left\\{ \\begin{pmatrix} 1 & 0 \\\\ 0 & 1 \\end{pmatrix}, \\begin{pmatrix} -1 & 0 \\\\ 0 & -1 \\end{pmatrix} \\right\\} = \\{\\pm I\\}';
		const input = [
			'Some kernel of \\(\\mathrm{SL}_2(\\mathbb{F}_7)\\):',
			'\\[',
			matrices,
			'\\]'
		].join('\n');

		expect(preprocessLaTeX(input)).toBe(
			['Some kernel of $\\mathrm{SL}_2(\\mathbb{F}_7)$:', '$$', matrices, '$$'].join('\n')
		);
	});

	test('handles standalone display math equation', () => {
		const equation = 'x = \\frac{-b \\pm \\sqrt{\\,b^{2}-4ac\\,}}{2a}';
		const input = ['Algebra:', '\\[', equation, '\\]'].join('\n');

		expect(preprocessLaTeX(input)).toBe(['Algebra:', '$$', equation, '$$'].join('\n'));
	});

	// Covers both halves of the currency rule in one case: "$" directly before
	// a digit is escaped as money, while "$x + y$" and "$100x$" stay math.
	test('does not interpret currency values as LaTeX', () => {
		const input = 'I have $10, $3.99 and $x + y$ and $100x$. The amount is $2,000.';

		expect(preprocessLaTeX(input)).toBe(
			'I have \\$10, \\$3.99 and $x + y$ and $100x$. The amount is \\$2,000.'
		);
	});

	test('handles real-world word problems with amounts and no math delimiters', () => {
		const input =
			'Emma buys 2 cupcakes for $3 each and 1 cookie for $1.50. How much money does she spend in total?';

		expect(preprocessLaTeX(input)).toBe(
			'Emma buys 2 cupcakes for \\$3 each and 1 cookie for \\$1.50. How much money does she spend in total?'
		);
	});

	test('handles decimal amounts in word problem correctly', () => {
		const input =
			'Maria has $20. She buys a notebook for $4.75 and a pack of pencils for $3.25. How much change does she receive?';

		expect(preprocessLaTeX(input)).toBe(
			'Maria has \\$20. She buys a notebook for \\$4.75 and a pack of pencils for \\$3.25. How much change does she receive?'
		);
	});

	test('preserves display math with surrounding non-ASCII text', () => {
		// The surrounding text is shared between input and expectation so the
		// two can never drift apart character-wise.
		const before = '1 kg の質量は';
		const equation =
			'E = (1\\ \\text{kg}) \\times (3.0 \\times 10^8\\ \\text{m/s})^2 \\approx 9.0 \\times 10^{16}\\ \\text{J}';
		const after =
			'というエネルギーに相当します。これは約 21 百万トンの TNT が爆発したときのエネルギーに匹敵します。';
		const input = [before, '\\[', equation, '\\]', after].join('\n');

		expect(preprocessLaTeX(input)).toBe([before, '$$', equation, '$$', after].join('\n'));
	});

	test('converts \\[ ... \\] when it follows text on the same line', () => {
		const input = 'Algebra: \\[x = \\frac{-b \\pm \\sqrt{\\,b^{2}-4ac\\,}}{2a}\\]';

		expect(preprocessLaTeX(input)).toBe(
			'Algebra: $$x = \\frac{-b \\pm \\sqrt{\\,b^{2}-4ac\\,}}{2a}$$'
		);
	});

	test('escapes isolated $ before digits ($5 → \\$5), but not valid math', () => {
		const input = 'This costs $5 and this is math $x^2$. $100 is money.';

		// $x^2$ is masked as inline math before the currency step runs, so only
		// the "$" characters that directly precede a digit get a backslash.
		expect(preprocessLaTeX(input)).toBe(
			'This costs \\$5 and this is math $x^2$. \\$100 is money.'
		);
	});

	test('handles mhchem notation safely if present', () => {
		const input = 'Chemical reaction: \\( \\ce{H2O} \\) and $\\ce{CO2}$';

		// Only "$" immediately followed by \ce{ gains the extra backslash;
		// "$ \ce{" (with a space in between) is left as is.
		expect(preprocessLaTeX(input)).toBe('Chemical reaction: $ \\ce{H2O} $ and $\\\\ce{CO2}$');
	});

	test('preserves code blocks', () => {
		const input = 'Inline code: `sum $total` and block:\n```\ndollar $amount\n```\nEnd.';

		// Dollar signs inside inline code and fenced code must come back unchanged.
		expect(preprocessLaTeX(input)).toBe(input);
	});
});

tools/server/webui/src/lib/utils/latex-protection.ts

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,103 @@ export function maskInlineLaTeX(content: string, latexExpressions: string[]): st
7474
})
7575
.join('\n');
7676
}
77+
78+
/**
 * Rewrites backslash math delimiters into dollar delimiters.
 *
 * - `\[ ... \]` (may span lines) becomes `$$ ... $$`
 * - `\( ... \)` (single line) becomes `$ ... $`
 * - fenced code and inline code spans are matched first and returned untouched,
 *   so delimiters inside them are never rewritten.
 *
 * @param text - Text that may contain `\(...\)` / `\[...\]` math.
 * @returns The text with those delimiters replaced outside of code.
 */
function escapeBrackets(text: string): string {
	// Alternation order matters: code is tried first so it shadows any math
	// delimiters that appear inside it.
	const codeOrMath = /(```[\S\s]*?```|`.*?`)|\\\[([\S\s]*?[^\\])\\]|\\\((.*?)\\\)/g;

	const rewrite = (whole: string, code?: string, display?: string, inline?: string): string => {
		if (code !== undefined) return code;
		if (display !== undefined) return `$$${display}$$`;
		// An empty capture ("\(\)") is still a match, hence the explicit
		// undefined check rather than a truthiness test.
		if (inline !== undefined) return `$${inline}$`;
		return whole;
	};

	return text.replace(codeOrMath, rewrite);
}
99+
100+
/**
 * Doubles the backslash of mhchem commands that directly follow a dollar sign:
 * `$\ce{` becomes `$\\ce{` and `$\pu{` becomes `$\\pu{`.
 * Occurrences not immediately preceded by `$` are left unchanged.
 */
function escapeMhchem(text: string): string {
	const doubled: ReadonlyArray<readonly [string, string]> = [
		['$\\ce{', '$\\\\ce{'],
		['$\\pu{', '$\\\\pu{']
	];

	let result = text;
	for (const [needle, replacement] of doubled) {
		// split/join is a literal replace-all with no regex or "$" pattern semantics.
		result = result.split(needle).join(replacement);
	}
	return result;
}
104+
105+
// See also:
106+
// https://github.com/danny-avila/LibreChat/blob/main/client/src/utils/latex.ts
107+
108+
// Protect code blocks: ```...``` and `...`
const codeBlockRegex = /(```[\s\S]*?```|`[^`\n]+`)/g;

/**
 * Preprocesses markdown content so LaTeX math renders correctly while dollar
 * amounts (e.g. $5.99) are not mistaken for math.
 *
 * Pipeline:
 * 1. Mask fenced code blocks and inline code with `<<CODE_BLOCK_n>>` placeholders.
 * 2. Mask existing math (`$$...$$`, `\[...\]`, `\(...\)`, and the inline `$...$`
 *    spans selected by `maskInlineLaTeX`) with `<<LATEX_n>>` placeholders.
 * 3. Escape every remaining `$` that directly precedes a digit (`$5` → `\$5`).
 * 4. Restore the masked math.
 * 5. Normalize delimiters (`\(...\)` → `$...$`, `\[...\]` → `$$...$$`) and apply
 *    mhchem escaping.
 * 6. Restore code last, so none of the rewrites above can alter text inside code.
 *
 * @param content - The raw text (e.g., markdown) that may contain LaTeX or code blocks.
 * @returns The preprocessed string with escaped currency and normalized LaTeX delimiters.
 *
 * @example
 * preprocessLaTeX("Price: $10. The equation is \\(x^2\\).")
 * // → "Price: \\$10. The equation is $x^2$."
 */
export function preprocessLaTeX(content: string): string {
	// Step 1: Protect code blocks
	const codeBlocks: string[] = [];
	content = content.replace(codeBlockRegex, (match) => {
		codeBlocks.push(match);
		return `<<CODE_BLOCK_${codeBlocks.length - 1}>>`;
	});

	// Step 2: Protect existing LaTeX expressions
	const latexExpressions: string[] = [];

	// Match \(...\), \[...\], $$...$$ and protect them
	content = content.replace(/(\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]|\\\(.*?\\\))/g, (match) => {
		latexExpressions.push(match);
		return `<<LATEX_${latexExpressions.length - 1}>>`;
	});

	// Protect inline $...$ but NOT if it looks like money (e.g., $10, $3.99)
	content = maskInlineLaTeX(content, latexExpressions);

	// Step 3: Escape standalone $ before digits (currency like $5 → \$5)
	// (Inline math is masked at this point, so only unprotected dollars are escaped.)
	content = content.replace(/\$(?=\d)/g, '\\$');

	// Step 4: Restore protected LaTeX expressions (they are valid).
	// A placeholder-looking token whose index was never issued (e.g. typed by
	// the user) is left as is instead of turning into the string "undefined".
	content = content.replace(
		/<<LATEX_(\d+)>>/g,
		(placeholder: string, index: string) => latexExpressions[Number(index)] ?? placeholder
	);

	// Step 5: Normalize delimiters and apply mhchem escaping. Code is still
	// masked here, so text inside code spans/fences cannot be touched.
	content = escapeBrackets(content);
	if (content.includes('\\ce{') || content.includes('\\pu{')) {
		content = escapeMhchem(content);
	}

	// Final pass for single-line \(...\) / \[...\] that escapeBrackets left
	// behind (e.g. display math whose body ends in a backslash). Function
	// replacers are used so "$" has no special meaning in the output; the
	// previous replacement string '$$$$1$$' expanded to the literal text "$$1$".
	content = content
		.replace(/\\\((.+?)\\\)/g, (_match: string, body: string) => `$${body}$`) // inline
		.replace(/\\\[(.+?)\\\]/g, (_match: string, body: string) => `$$${body}$$`); // display

	// Step 6: Restore code blocks last so they come back byte-for-byte.
	content = content.replace(
		/<<CODE_BLOCK_(\d+)>>/g,
		(placeholder: string, index: string) => codeBlocks[Number(index)] ?? placeholder
	);

	return content;
}

0 commit comments

Comments
 (0)