Files
upage-git/app/utils/html-parse.ts
2025-09-24 17:02:44 +08:00

172 lines
5.1 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { createScopedLogger } from './logger';
const logger = createScopedLogger('htmlParse');
export function isScriptContent(content: string): boolean {
return content.trim().startsWith('<script');
}
/**
* 验证内容是否有效
* - 检查是否包含完整的 id 属性
* - 检查是否符合内容类型要求HTML、JS、CSS
* @param content 内容字符串
* @returns {boolean} 内容是否有效
*/
export function isValidContent(content: string): boolean {
if (!content || typeof content !== 'string') {
return false;
}
// 处理可能存在的不完整标签
content = sanitizeHtmlContent(content);
try {
// 创建一个临时的 DOM 解析器
const parser = new DOMParser();
const doc = parser.parseFromString(content, 'text/html');
// 检查内容类型
if (content.trim().startsWith('<script')) {
// JavaScript 内容验证
const scriptElements = doc.getElementsByTagName('script');
if (scriptElements.length !== 1) {
logger.warn('JS content must have exactly one script element', {
contentLength: content.length,
elementCount: scriptElements.length,
});
return false;
}
const scriptElement = scriptElements[0];
// 检查脚本元素是否有 id 属性
if (!scriptElement.id) {
logger.warn('JS content must have an id attribute on the script element', { contentLength: content.length });
return false;
}
// 检查 id 是否完整,即属性在原始内容中是否以 id="..." 或 id='...' 的形式完整出现
if (content.indexOf(`id="${scriptElement.id}"`) === -1 && content.indexOf(`id='${scriptElement.id}'`) === -1) {
logger.warn('JS content contains incomplete id attribute', { contentLength: content.length });
return false;
}
// 检查 script 标签是否有完整的闭合标签
if (!content.includes('</script>')) {
logger.warn('JS content must have closing </script> tag', { contentLength: content.length });
return false;
}
return true;
}
if (content.trim().startsWith('<style')) {
// CSS 内容验证
const styleElements = doc.getElementsByTagName('style');
if (styleElements.length !== 1) {
logger.warn('CSS content must have exactly one style element', {
contentLength: content.length,
elementCount: styleElements.length,
});
return false;
}
const styleElement = styleElements[0];
// 检查样式元素是否有 id 属性
if (!styleElement.id) {
logger.warn('CSS content must have an id attribute on the style element', { contentLength: content.length });
return false;
}
if (content.indexOf(`id="${styleElement.id}"`) === -1 && content.indexOf(`id='${styleElement.id}'`) === -1) {
logger.warn('CSS content contains incomplete id attribute', { contentLength: content.length });
return false;
}
// 检查 style 标签是否有完整的闭合标签
if (!content.includes('</style>')) {
logger.warn('style content must have closing </style> tag');
return false;
}
return true;
}
// HTML 内容验证
const bodyChildren = doc.body.children;
if (bodyChildren.length !== 1) {
logger.warn('HTML content must have exactly one root element', {
contentLength: content.length,
rootCount: bodyChildren.length,
});
return false;
}
const rootElement = bodyChildren[0];
// 检查根元素是否有 id 属性
if (!rootElement.id) {
logger.warn('HTML content must have an id attribute on the root element');
return false;
}
if (content.indexOf(`id="${rootElement.id}"`) === -1 && content.indexOf(`id='${rootElement.id}'`) === -1) {
logger.warn('HTML content contains incomplete id attribute');
return false;
}
return true;
} catch (error) {
logger.error('Error validating content', error);
return false;
}
}
/**
* 处理可能存在的不完整内容
* 特别处理末尾可能存在的不完整标签如 </
* @param content 内容字符串
* @returns {string} 处理后的内容
*/
export function sanitizeHtmlContent(content: string): string {
if (!content) {
return content;
}
// 检查是否以不完整的标签结尾
const incompleteEndingRegex = /<\/?[a-zA-Z][a-zA-Z0-9]*$/;
if (incompleteEndingRegex.test(content)) {
// 移除不完整的结束标签
logger.warn(
'Incomplete tag detected at the end of content',
JSON.stringify({
contentEnd: content.slice(-10),
contentLength: content.length,
}),
);
return content.replace(incompleteEndingRegex, '');
}
// 检查是否有不匹配的标签 (简单检查)
const openTags = content.match(/<[a-zA-Z][^>]*>/g) || [];
const closeTags = content.match(/<\/[a-zA-Z][^>]*>/g) || [];
if (openTags.length !== closeTags.length) {
logger.warn(
'Potential unbalanced tags detected',
JSON.stringify({
openTagsCount: openTags.length,
closeTagsCount: closeTags.length,
}),
);
}
return content;
}