/**
 * HTML Cleaner Utility
 *
 * Provides functions to clean HTML content for Qdrant vector database storage.
 * Strips HTML tags, decodes HTML entities, and normalizes whitespace while
 * preserving paragraph structure.
 */

/**
 * Named HTML entities understood by `decodeHtmlEntities`.
 *
 * Keys are the complete entity text (leading `&` and trailing `;` included) and
 * are case-sensitive, so `&dagger;` and `&Dagger;` are distinct entries.
 * Non-ASCII values are written as `\u` escapes so the table cannot be damaged
 * by a tool that decodes or re-encodes the file.
 */
const HTML_ENTITIES: Record<string, string> = {
  '&nbsp;': ' ', // deliberately a plain space so later whitespace collapsing applies
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&apos;': "'", // the numeric form &#39; is handled by the numeric branch
  '&cent;': '\u00A2',
  '&pound;': '\u00A3',
  '&yen;': '\u00A5',
  '&euro;': '\u20AC',
  '&copy;': '\u00A9',
  '&reg;': '\u00AE',
  '&trade;': '\u2122',
  '&times;': '\u00D7',
  '&divide;': '\u00F7',
  '&mdash;': '\u2014',
  '&ndash;': '\u2013',
  '&hellip;': '\u2026',
  '&laquo;': '\u00AB',
  '&raquo;': '\u00BB',
  '&lsquo;': '\u2018',
  '&rsquo;': '\u2019',
  '&ldquo;': '\u201C',
  '&rdquo;': '\u201D',
  '&bull;': '\u2022',
  '&middot;': '\u00B7',
  '&sect;': '\u00A7',
  '&para;': '\u00B6',
  '&dagger;': '\u2020',
  '&Dagger;': '\u2021',
  '&permil;': '\u2030',
  '&prime;': '\u2032',
  '&Prime;': '\u2033',
};

/**
 * Matches one character reference: decimal (`&#169;`, group 1), hexadecimal
 * (`&#xA9;` / `&#XA9;`, group 2) or named (`&copy;`, no group).
 */
const HTML_ENTITY_PATTERN = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|[A-Za-z][A-Za-z0-9]*);/g;

/** Highest valid Unicode code point. */
const MAX_UNICODE_CODE_POINT = 0x10ffff;

/** U+FFFD, substituted for numeric references that do not name a valid character. */
const UNICODE_REPLACEMENT_CHARACTER = '\uFFFD';

/**
 * Convert the value of a numeric character reference to text.
 *
 * NUL, surrogate halves and values beyond U+10FFFF become U+FFFD instead of
 * producing a truncated character or throwing.
 */
function numericReferenceToText(codePoint: number): string {
  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  if (
    !Number.isFinite(codePoint) ||
    codePoint === 0 ||
    codePoint > MAX_UNICODE_CODE_POINT ||
    isSurrogate
  ) {
    return UNICODE_REPLACEMENT_CHARACTER;
  }
  // fromCodePoint (unlike fromCharCode) handles astral characters such as emoji.
  return String.fromCodePoint(codePoint);
}

/**
 * Decode HTML character references in text.
 *
 * The input is scanned exactly once, so text produced by decoding is never
 * decoded a second time (`&amp;lt;` yields `&lt;`, not `<`).
 *
 * @param text - Text that may contain named or numeric character references
 * @returns The text with every recognised reference replaced; unknown named
 *   references are left untouched, and falsy input is returned as-is
 */
function decodeHtmlEntities(text: string): string {
  if (!text) return text;

  return text.replace(
    HTML_ENTITY_PATTERN,
    (match: string, decimal: string | undefined, hex: string | undefined): string => {
      if (decimal !== undefined) return numericReferenceToText(parseInt(decimal, 10));
      if (hex !== undefined) return numericReferenceToText(parseInt(hex, 16));

      // Own-property check keeps inherited Object.prototype members out of the lookup.
      const named = Object.prototype.hasOwnProperty.call(HTML_ENTITIES, match)
        ? HTML_ENTITIES[match]
        : undefined;
      return named ?? match;
    }
  );
}

/**
 * Clean HTML content for plain text storage
 *
 * This function:
 * - Drops <script>/<style> elements (including their contents) and HTML comments
 * - Converts <br>, <br/>, <br /> to a newline and </p>, </div>, </h1>-</h6> to a
 *   paragraph break; list items become "• " bullet lines
 * - Removes all other HTML tags
 * - Decodes HTML entities (&nbsp;, &amp;, etc.)
 * - Turns tabs and non-breaking spaces into spaces and collapses runs of spaces
 * - Trims every line, then keeps at most 2 consecutive newlines (one blank line
 *   between paragraphs)
 * - Trims leading/trailing whitespace
 *
 * @param html - HTML string to clean
 * @returns Plain text with preserved paragraph structure; '' for null, undefined
 *   or empty input
 */
export function cleanHtmlContent(html: string | null | undefined): string {
  if (!html) return '';

  let text = html;

  // Step 1: Remove content that is never visible text. Without this the generic
  // tag stripper below would keep the JavaScript/CSS source as if it were prose.
  text = text.replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, '');
  text = text.replace(/<!--[\s\S]*?-->/g, '');

  // Step 2: Replace block-level tags with newlines to preserve paragraph structure
  text = text.replace(/<br\s*\/?>/gi, '\n');
  text = text.replace(/<\/p>/gi, '\n\n');
  text = text.replace(/<\/div>/gi, '\n\n');
  text = text.replace(/<\/(h[1-6])>/gi, '\n\n');

  // List items start a new bulleted line; the closing tag carries no text
  text = text.replace(/<li[^>]*>/gi, '\n• ');
  text = text.replace(/<\/li>/gi, '');
  text = text.replace(/<\/(ul|ol)>/gi, '\n');

  // Step 3: Remove all remaining HTML tags
  text = text.replace(/<[^>]*>/g, '');

  // Step 4: Decode HTML entities. Done after tag removal so that decoded
  // "&lt;...&gt;" text is not mistaken for markup.
  text = decodeHtmlEntities(text);

  // Step 5: Normalize whitespace. Line endings are unified after decoding so
  // that references such as &#13;&#10; are covered as well.
  text = text.replace(/\r\n?/g, '\n');
  // A literal U+00A0 (or &#160;) is treated the same way as &nbsp;
  text = text.replace(/[\t\u00A0]/g, ' ');
  text = text.replace(/ {2,}/g, ' ');

  // Step 6: Trim each line BEFORE collapsing newlines; otherwise a line holding
  // only spaces would survive the collapse and leave 3+ consecutive newlines.
  text = text
    .split('\n')
    .map(line => line.trim())
    .join('\n');

  // Step 7: Keep at most 2 consecutive newlines (paragraph separation)
  text = text.replace(/\n{3,}/g, '\n\n');

  // Step 8: Remove leading/trailing newlines
  return text.trim();
}

/**
 * Build one plain-text description for a product from its long and short
 * descriptions.
 *
 * Each non-empty input is passed through `cleanHtmlContent`. The short
 * description is appended only when its raw value differs from the main
 * description and its cleaned text is not already present.
 *
 * @param description - Product description (can be HTML)
 * @param shortDescription - Short product description (can be HTML)
 * @returns The cleaned texts joined by a blank line, or '' when nothing remains
 */
export function cleanProductDescription(
  description: string | null | undefined,
  shortDescription?: string | null | undefined
): string {
  const sections: string[] = [];

  // Main description first
  const mainText = description ? cleanHtmlContent(description) : '';
  if (mainText) {
    sections.push(mainText);
  }

  // Short description, skipped when it is the same raw string as the main one
  if (shortDescription && shortDescription !== description) {
    const shortText = cleanHtmlContent(shortDescription);
    const alreadyPresent = sections.some(existing => existing === shortText);
    if (shortText && !alreadyPresent) {
      sections.push(shortText);
    }
  }

  return sections.join('\n\n');
}

/**
 * Test/example function to demonstrate HTML cleaning
 *
 * Runs `cleanHtmlContent` on a fixed set of inputs, compares each result with
 * the expected plain text and prints a PASS/FAIL line per example followed by a
 * summary. It only logs to the console; it does not throw on failure.
 */
export function testHtmlCleaner(): void {
  const examples: ReadonlyArray<{ input: string; expected: string }> = [
    {
      input: '<p>Product description with <strong>bold</strong> text</p>',
      expected: 'Product description with bold text'
    },
    {
      // Entities are spelled out so this case really exercises decoding and
      // the collapsing of the resulting run of spaces.
      input: 'Multiple&nbsp;&nbsp;&nbsp;spaces&amp;entities',
      expected: 'Multiple spaces&entities'
    },
    {
      input: '<p>First paragraph</p><p>Second paragraph</p>',
      expected: 'First paragraph\n\nSecond paragraph'
    },
    {
      input: 'Line 1<br/>Line 2<br>Line 3',
      expected: 'Line 1\nLine 2\nLine 3'
    },
    {
      input: '<ul><li>Item 1</li><li>Item 2</li></ul>',
      expected: '• Item 1\n• Item 2'
    }
  ];

  console.log('[HTML Cleaner] Running tests...');
  let passed = 0;
  let failed = 0;

  for (const example of examples) {
    const result = cleanHtmlContent(example.input);
    if (result === example.expected) {
      console.log(`✓ PASS: "${example.input}" -> "${result}"`);
      passed++;
    } else {
      console.log(`✗ FAIL: "${example.input}"`);
      console.log(` Expected: "${example.expected}"`);
      console.log(` Got: "${result}"`);
      failed++;
    }
  }

  console.log(`\n[HTML Cleaner] Tests complete: ${passed} passed, ${failed} failed`);
}