import { createScopedLogger } from './logger';
const logger = createScopedLogger('htmlParse');
export function isScriptContent(content: string): boolean {
return content.trim().startsWith(' tag', { contentLength: content.length });
return false;
}
return true;
}
if (content.trim().startsWith(' tag');
return false;
}
return true;
}
// Validate the HTML content
const bodyChildren = doc.body.children;
if (bodyChildren.length !== 1) {
logger.warn('HTML content must have exactly one root element', {
contentLength: content.length,
rootCount: bodyChildren.length,
});
return false;
}
const rootElement = bodyChildren[0];
// Check that the root element has an id attribute
if (!rootElement.id) {
logger.warn('HTML content must have an id attribute on the root element');
return false;
}
if (content.indexOf(`id="${rootElement.id}"`) === -1 && content.indexOf(`id='${rootElement.id}'`) === -1) {
logger.warn('HTML content contains incomplete id attribute');
return false;
}
return true;
} catch (error) {
logger.error('Error validating content', error);
return false;
}
}
/**
 * Cleans up content that may have been cut off part-way through a tag.
 *
 * If the content ends inside a tag that was never terminated with `>` — for
 * example a trailing `<div`, `</sp` or `<span class="box` — that dangling
 * fragment is removed and a warning is logged. Otherwise the content is
 * returned unchanged, and a warning is logged when the number of opening and
 * closing tags differs.
 *
 * The balance check is a rough heuristic that only affects logging: void
 * elements (`<br>`, `<img>`, ...) and self-closing tags (`<path />`) are not
 * counted as opening tags because they have no matching closing tag.
 *
 * @param content The raw content string; empty input is returned as is.
 * @returns The content with any incomplete trailing tag removed.
 */
export function sanitizeHtmlContent(content: string): string {
  if (!content) {
    return content;
  }

  // `<` or `</` followed by a tag name and then no further `<` or `>` up to the
  // end of the string: a tag was started but never closed. Anything typed after
  // the tag name (partial attributes) belongs to the same dangling fragment.
  const incompleteEndingRegex = /<\/?[a-zA-Z][^<>]*$/;
  const incompleteStart = content.search(incompleteEndingRegex);
  if (incompleteStart !== -1) {
    logger.warn(
      'Incomplete tag detected at the end of content',
      JSON.stringify({
        contentEnd: content.slice(-10),
        contentLength: content.length,
      }),
    );
    // The pattern is anchored to the end, so cutting at the match start drops
    // exactly the dangling fragment.
    return content.slice(0, incompleteStart);
  }

  // Tags that are never paired with a closing tag: HTML void elements and
  // tags written in self-closing form.
  const unpairedTagRegex =
    /^<(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)(?=[\s/>])|\/>$/i;

  // Simple count-based check for mismatched tags (does not verify nesting).
  const openTags = (content.match(/<[a-zA-Z][^>]*>/g) || []).filter((tag) => !unpairedTagRegex.test(tag));
  const closeTags = content.match(/<\/[a-zA-Z][^>]*>/g) || [];
  if (openTags.length !== closeTags.length) {
    logger.warn(
      'Potential unbalanced tags detected',
      JSON.stringify({
        openTagsCount: openTags.length,
        closeTagsCount: closeTags.length,
      }),
    );
  }

  return content;
}