Răsfoiți Sursa

feat: clean HTML from product descriptions in Qdrant sync #92

- Created html-cleaner.ts utility for comprehensive HTML cleanup
- Removes HTML tags while preserving paragraph structure
- Decodes HTML entities (`&nbsp;`, `&amp;`, etc.)
- Converts <br/> tags to newlines
- Normalizes whitespace and removes duplicate newlines
- Applied to all product descriptions synced to Qdrant
- Updated shopify-sync, woocommerce-sync, and shoprenter-sync functions
- Deployed and verified all affected Edge Functions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Claude 5 luni în urmă
părinte
comite
dec7b6b

+ 220 - 0
supabase/functions/_shared/html-cleaner.ts

@@ -0,0 +1,220 @@
+/**
+ * HTML Cleaner Utility
+ *
+ * Provides functions to clean HTML content for Qdrant vector database storage.
+ * Removes HTML tags, special characters, and normalizes whitespace while preserving
+ * paragraph structure.
+ */
+
+/**
+ * Lookup table of the named HTML entities this module decodes.
+ *
+ * Keys are the complete entity text (leading `&` and trailing `;` included) so a
+ * regex match can be used directly as the lookup key.
+ */
+const HTML_ENTITIES: Record<string, string> = {
+  '&nbsp;': ' ',
+  '&amp;': '&',
+  '&lt;': '<',
+  '&gt;': '>',
+  '&quot;': '"',
+  '&apos;': "'",
+  '&cent;': '¢',
+  '&pound;': '£',
+  '&yen;': '¥',
+  '&euro;': '€',
+  '&copy;': '©',
+  '&reg;': '®',
+  '&trade;': '™',
+  '&times;': '×',
+  '&divide;': '÷',
+  '&mdash;': '—',
+  '&ndash;': '–',
+  '&hellip;': '…',
+  '&laquo;': '\u00AB',
+  '&raquo;': '\u00BB',
+  '&lsquo;': '\u2018',
+  '&rsquo;': '\u2019',
+  '&ldquo;': '\u201C',
+  '&rdquo;': '\u201D',
+  '&bull;': '•',
+  '&middot;': '·',
+  '&sect;': '§',
+  '&para;': '¶',
+  '&dagger;': '†',
+  '&Dagger;': '‡',
+  '&permil;': '‰',
+  '&prime;': '′',
+  '&Prime;': '″',
+};
+
+/**
+ * Matches a single character reference: decimal (`&#60;`, capture group 1),
+ * hexadecimal (`&#x3C;` / `&#X3C;`, capture group 2) or named (`&lt;`).
+ */
+const ENTITY_PATTERN = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|[A-Za-z][A-Za-z0-9]*);/g;
+
+/** Highest valid Unicode code point; `String.fromCodePoint` throws above it. */
+const MAX_CODE_POINT = 0x10ffff;
+
+/**
+ * Decode HTML entities in text.
+ *
+ * All references are decoded in a single left-to-right pass, so the output of one
+ * replacement is never re-scanned. This keeps escaped text escaped exactly once:
+ * `&amp;lt;` becomes `&lt;`, not `<`.
+ *
+ * @param text - Text that may contain named, decimal or hexadecimal entities
+ * @returns The text with known entities replaced. Unknown named entities and
+ *          numeric references outside the Unicode range are left untouched.
+ */
+function decodeHtmlEntities(text: string): string {
+  if (!text) return text;
+
+  return text.replace(
+    ENTITY_PATTERN,
+    (match: string, decimal: string | undefined, hex: string | undefined): string => {
+      if (decimal === undefined && hex === undefined) {
+        // Named entity: fall back to the original text when it is not in the table.
+        return HTML_ENTITIES[match] ?? match;
+      }
+
+      const codePoint = decimal !== undefined ? parseInt(decimal, 10) : parseInt(hex as string, 16);
+      if (!Number.isInteger(codePoint) || codePoint > MAX_CODE_POINT) {
+        return match;
+      }
+
+      // fromCodePoint (unlike fromCharCode) handles astral characters such as emoji.
+      return String.fromCodePoint(codePoint);
+    }
+  );
+}
+
+/**
+ * Clean HTML content for plain text storage
+ *
+ * This function:
+ * - Drops <script>/<style> elements (including their content) and HTML comments
+ * - Converts <br>, <br/>, <br /> to a newline and closing </p>, </div>, </h1>-</h6>
+ *   to a paragraph break
+ * - Turns <li> items into "• " bullet lines
+ * - Removes all other HTML tags
+ * - Decodes HTML entities (&nbsp;, &amp;, etc.)
+ * - Collapses runs of spaces, tabs and non-breaking spaces into a single space
+ * - Trims every line, then keeps at most 2 consecutive newlines (paragraph separation)
+ * - Trims leading/trailing whitespace
+ *
+ * @param html - HTML string to clean
+ * @returns Plain text with preserved paragraph structure; '' for null/undefined/empty input
+ */
+export function cleanHtmlContent(html: string | null | undefined): string {
+  if (!html) return '';
+
+  let text = html;
+
+  // Step 1: Remove elements whose content is never human-readable text.
+  // Without this, stripping only the tags would leave raw CSS/JS in the output.
+  text = text.replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, '');
+  text = text.replace(/<!--[\s\S]*?-->/g, '');
+
+  // Step 2: Replace block-level tags with newlines to preserve paragraph structure
+  // Convert <br>, <br/>, <br /> tags to newlines
+  text = text.replace(/<br\s*\/?>/gi, '\n');
+
+  // Convert closing paragraph, div and heading tags to double newlines (paragraph break)
+  text = text.replace(/<\/p>/gi, '\n\n');
+  text = text.replace(/<\/div>/gi, '\n\n');
+  text = text.replace(/<\/(h[1-6])>/gi, '\n\n');
+
+  // Convert list items to newlines with bullet
+  text = text.replace(/<li[^>]*>/gi, '\n• ');
+  text = text.replace(/<\/li>/gi, '');
+
+  // Convert closing list tags to newlines
+  text = text.replace(/<\/(ul|ol)>/gi, '\n');
+
+  // Step 3: Remove all remaining HTML tags
+  text = text.replace(/<[^>]*>/g, '');
+
+  // Step 4: Decode HTML entities (after tag removal, so an escaped "&lt;b&gt;"
+  // survives as the literal text "<b>" instead of being stripped as a tag)
+  text = decodeHtmlEntities(text);
+
+  // Step 5: Normalize line endings (Windows \r\n and old Mac \r) to \n
+  text = text.replace(/\r\n?/g, '\n');
+
+  // Step 6: Collapse horizontal whitespace. U+00A0 is included so that a
+  // non-breaking space written as &#160; is treated like &nbsp;, which is
+  // decoded to a regular space.
+  text = text.replace(/[ \t\u00A0]+/g, ' ');
+
+  // Step 7: Trim each line. This must happen BEFORE collapsing newlines:
+  // whitespace-only lines (e.g. indentation between </p> and </div>) would
+  // otherwise break up newline runs and let more than 2 blank lines through.
+  text = text.split('\n').map(line => line.trim()).join('\n');
+
+  // Step 8: Remove duplicate newlines (keep max 2 for paragraph separation)
+  text = text.replace(/\n{3,}/g, '\n\n');
+
+  // Step 9: Remove leading/trailing newlines
+  return text.trim();
+}
+
+/**
+ * Build one plain-text description from a product's main and short descriptions.
+ *
+ * Each input is passed through cleanHtmlContent. The short description is skipped
+ * when it is the same raw string as the main description, or when it cleans down
+ * to text that is already present. Remaining parts are joined by a blank line.
+ *
+ * @param description - Product description (can be HTML)
+ * @param shortDescription - Short product description (can be HTML)
+ * @returns Cleaned description text ('' when neither input yields any text)
+ */
+export function cleanProductDescription(
+  description: string | null | undefined,
+  shortDescription?: string | null | undefined
+): string {
+  const sections: string[] = [];
+
+  // Main description first, if it yields any text
+  const mainText = description ? cleanHtmlContent(description) : '';
+  if (mainText) {
+    sections.push(mainText);
+  }
+
+  // Short description only when the raw value differs from the main one
+  const hasDistinctShort = Boolean(shortDescription) && shortDescription !== description;
+  if (hasDistinctShort) {
+    const shortText = cleanHtmlContent(shortDescription);
+    const alreadyPresent = sections.includes(shortText);
+    if (shortText && !alreadyPresent) {
+      sections.push(shortText);
+    }
+  }
+
+  return sections.join('\n\n');
+}
+
+/**
+ * Test/example function to demonstrate HTML cleaning.
+ *
+ * Runs cleanHtmlContent over a fixed list of input/expected pairs and logs a
+ * PASS/FAIL line per example plus a summary to the console. It neither throws
+ * nor returns a result; inspect the log output to see failures.
+ */
+export function testHtmlCleaner(): void {
+  const examples = [
+    {
+      input: '<p>Product description with <strong>bold</strong> text</p>',
+      expected: 'Product description with bold text'
+    },
+    {
+      // The two &nbsp; decode to two spaces, which whitespace normalization
+      // then collapses into one, so the expected text has a single space.
+      input: 'Multiple&nbsp;&nbsp;spaces&amp;entities',
+      expected: 'Multiple spaces&entities'
+    },
+    {
+      input: '<p>First paragraph</p><p>Second paragraph</p>',
+      expected: 'First paragraph\n\nSecond paragraph'
+    },
+    {
+      input: 'Line 1<br/>Line 2<br>Line 3',
+      expected: 'Line 1\nLine 2\nLine 3'
+    },
+    {
+      input: '<ul><li>Item 1</li><li>Item 2</li></ul>',
+      expected: '• Item 1\n• Item 2'
+    }
+  ];
+
+  console.log('[HTML Cleaner] Running tests...');
+  let passed = 0;
+  let failed = 0;
+
+  for (const example of examples) {
+    const result = cleanHtmlContent(example.input);
+    if (result === example.expected) {
+      console.log(`✓ PASS: "${example.input}" -> "${result}"`);
+      passed++;
+    } else {
+      console.log(`✗ FAIL: "${example.input}"`);
+      console.log(`  Expected: "${example.expected}"`);
+      console.log(`  Got:      "${result}"`);
+      failed++;
+    }
+  }
+
+  console.log(`\n[HTML Cleaner] Tests complete: ${passed} passed, ${failed} failed`);
+}

+ 8 - 6
supabase/functions/_shared/qdrant-client.ts

@@ -8,6 +8,8 @@
  * Metric: Cosine (best for normalized embeddings)
  */
 
+import { cleanHtmlContent } from './html-cleaner.ts';
+
 const QDRANT_URL = 'http://142.93.100.6:6333';
 const QDRANT_API_KEY = 'pyXAyyEPbLzba2RvdBwm';
 const VECTOR_SIZE = 3072;
@@ -514,17 +516,17 @@ export function createProductText(product: any): string {
     parts.push(product.title || product.name);
   }
 
-  // Primary description (cleaned up HTML)
+  // Primary description (cleaned up HTML with proper entity decoding and newline handling)
   if (product.description) {
-    const desc = product.description.replace(/<[^>]*>/g, '').trim();
+    const desc = cleanHtmlContent(product.description);
     if (desc) {
       parts.push(desc);
     }
   }
 
-  // Short description (WooCommerce)
+  // Short description (WooCommerce) - only add if different from main description
   if (product.short_description && product.short_description !== product.description) {
-    const shortDesc = product.short_description.replace(/<[^>]*>/g, '').trim();
+    const shortDesc = cleanHtmlContent(product.short_description);
     if (shortDesc) {
       parts.push(shortDesc);
     }
@@ -595,9 +597,9 @@ export function createProductText(product: any): string {
     }
   }
 
-  // Meta description (ShopRenter/SEO)
+  // Meta description (ShopRenter/SEO) - only add if different from main description
   if (product.meta_description && product.meta_description !== product.description) {
-    const metaDesc = product.meta_description.replace(/<[^>]*>/g, '').trim();
+    const metaDesc = cleanHtmlContent(product.meta_description);
     if (metaDesc) {
       parts.push(metaDesc);
     }