5 luni în urmă · dec7b6bada
--- a/supabase/functions/_shared/html-cleaner.ts
+++ b/supabase/functions/_shared/html-cleaner.ts
@@ -0,0 +1,220 @@
 
				+/**
			
 
				+ * HTML Cleaner Utility
			
 
				+ *
			
 
				+ * Provides functions to clean HTML content for Qdrant vector database storage.
			
 
				+ * Removes HTML tags, special characters, and normalizes whitespace while preserving
			
 
				+ * paragraph structure.
			
 
				+ */
			
 
				+
			
 
				/**
				 * Named HTML entities understood by `decodeHtmlEntities`, keyed by the full
				 * reference text (leading `&` and trailing `;` included). Lookups are
				 * case-sensitive, so `&dagger;` and `&Dagger;` are distinct entries.
				 */
				const HTML_ENTITIES: Record<string, string> = {
				  '&nbsp;': ' ',
				  '&amp;': '&',
				  '&lt;': '<',
				  '&gt;': '>',
				  '&quot;': '"',
				  '&apos;': "'",
				  '&cent;': '¢',
				  '&pound;': '£',
				  '&yen;': '¥',
				  '&euro;': '€',
				  '&copy;': '©',
				  '&reg;': '®',
				  '&trade;': '™',
				  '&times;': '×',
				  '&divide;': '÷',
				  '&mdash;': '—',
				  '&ndash;': '–',
				  '&hellip;': '…',
				  '&laquo;': '\u00AB',
				  '&raquo;': '\u00BB',
				  '&lsquo;': '\u2018',
				  '&rsquo;': '\u2019',
				  '&ldquo;': '\u201C',
				  '&rdquo;': '\u201D',
				  '&bull;': '•',
				  '&middot;': '·',
				  '&sect;': '§',
				  '&para;': '¶',
				  '&dagger;': '†',
				  '&Dagger;': '‡',
				  '&permil;': '‰',
				  '&prime;': '′',
				  '&Prime;': '″',
				};

				/** Largest value accepted by `String.fromCodePoint`. */
				const MAX_CODE_POINT = 0x10ffff;

				/**
				 * Matches a single character reference: decimal (`&#123;`, capture 1),
				 * hexadecimal (`&#x1F;` / `&#X1F;`, capture 2) or named (`&name;`).
				 */
				const ENTITY_PATTERN = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|[a-zA-Z][a-zA-Z0-9]*);/g;

				/**
				 * Convert a numeric character reference value to its string form.
				 *
				 * Values above the Unicode range cannot be represented (and would make
				 * `String.fromCodePoint` throw), so they become U+FFFD REPLACEMENT CHARACTER.
				 */
				function codePointToString(codePoint: number): string {
				  if (codePoint > MAX_CODE_POINT) return '\uFFFD';
				  return String.fromCodePoint(codePoint);
				}

				/**
				 * Decode HTML entities in text.
				 *
				 * All references are decoded in one left-to-right pass, so the output of one
				 * replacement is never decoded again: `&amp;lt;` yields the literal text
				 * `&lt;`, not `<`. Named entities missing from `HTML_ENTITIES` are left
				 * untouched.
				 *
				 * @param text - Text that may contain named or numeric character references
				 * @returns The text with every recognised reference replaced by its character
				 */
				function decodeHtmlEntities(text: string): string {
				  if (!text) return text;

				  return text.replace(ENTITY_PATTERN, (match: string, dec?: string, hex?: string) => {
				    if (dec !== undefined) return codePointToString(parseInt(dec, 10));
				    if (hex !== undefined) return codePointToString(parseInt(hex, 16));
				    // Named reference: fall back to the original text when it is unknown.
				    return HTML_ENTITIES[match] ?? match;
				  });
				}
			
 
				+
			
 
				/**
				 * Clean HTML content for plain text storage.
				 *
				 * This function:
				 * - Drops HTML comments and `<script>` / `<style>` elements together with their content
				 * - Converts `<br>`, `<br/>`, `<br />` to a newline and `</p>`, `</div>`, `</h1>`-`</h6>`
				 *   to a paragraph break
				 * - Turns list items into bulleted lines
				 * - Removes all other HTML tags
				 * - Decodes HTML entities (&nbsp;, &amp;, etc.)
				 * - Collapses runs of spaces, tabs and non-breaking spaces
				 * - Trims every line, then keeps at most 2 consecutive newlines
				 * - Trims leading/trailing whitespace
				 *
				 * @param html - HTML string to clean
				 * @returns Plain text with preserved paragraph structure ('' for empty/null input)
				 */
				export function cleanHtmlContent(html: string | null | undefined): string {
				  if (!html) return '';

				  let text = html;

				  // Step 1: Drop content that is never visible text. This must happen before the
				  // generic tag removal, which would strip the tags but keep the JS/CSS source.
				  text = text.replace(/<!--[\s\S]*?-->/g, '');
				  text = text.replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, '');

				  // Step 2: Replace block-level tags with newlines to preserve paragraph structure
				  text = text.replace(/<br\s*\/?>/gi, '\n');
				  text = text.replace(/<\/(?:p|div|h[1-6])>/gi, '\n\n');

				  // List items become bulleted lines. The word boundary keeps tags that merely
				  // start with "li" (e.g. <link>) from being turned into bullets.
				  text = text.replace(/<li\b[^>]*>/gi, '\n• ');
				  text = text.replace(/<\/li>/gi, '');
				  text = text.replace(/<\/(ul|ol)>/gi, '\n');

				  // Step 3: Remove all remaining HTML tags
				  text = text.replace(/<[^>]*>/g, '');

				  // Step 4: Decode HTML entities (after tag removal, so decoded '<' stays text)
				  text = decodeHtmlEntities(text);

				  // Step 5: Normalize whitespace
				  // Windows (\r\n) and old Mac (\r) line endings become \n
				  text = text.replace(/\r\n?/g, '\n');
				  // Tabs and non-breaking spaces (e.g. from &#160;) count as ordinary spaces
				  text = text.replace(/[\t\u00A0]/g, ' ');
				  text = text.replace(/ {2,}/g, ' ');

				  // Step 6: Trim each line BEFORE collapsing newlines; otherwise a line that
				  // holds only spaces hides a run of 3+ newlines from the collapse below.
				  text = text
				    .split('\n')
				    .map((line) => line.trim())
				    .join('\n');

				  // Step 7: Keep at most 2 consecutive newlines (one blank line between paragraphs)
				  text = text.replace(/\n{3,}/g, '\n\n');

				  // Step 8: Remove leading/trailing newlines
				  return text.trim();
				}
			
 
				+
			
 
				/**
				 * Build one plain-text description from a product's main and short descriptions.
				 *
				 * Both inputs are passed through `cleanHtmlContent`. The short description is
				 * appended, separated by a blank line, only when its raw value differs from
				 * the main description and its cleaned text is non-empty and not identical to
				 * the cleaned main text.
				 *
				 * @param description - Product description (can be HTML)
				 * @param shortDescription - Short product description (can be HTML)
				 * @returns Cleaned description text, or '' when neither input yields any text
				 */
				export function cleanProductDescription(
				  description: string | null | undefined,
				  shortDescription?: string | null | undefined
				): string {
				  const mainText = description ? cleanHtmlContent(description) : '';

				  const shortIsDistinctInput = Boolean(shortDescription) && shortDescription !== description;
				  const shortText = shortIsDistinctInput ? cleanHtmlContent(shortDescription) : '';

				  // Nothing extra to append: short text is empty or duplicates the main text.
				  if (!shortText || shortText === mainText) {
				    return mainText;
				  }

				  return mainText ? [mainText, shortText].join('\n\n') : shortText;
				}
			
 
				+
			
 
				/**
				 * Test/example function to demonstrate HTML cleaning.
				 *
				 * Runs `cleanHtmlContent` over a fixed set of inputs, logs a PASS/FAIL line per
				 * example to the console, and finishes with a pass/fail summary. It returns
				 * nothing and never throws on a failed example.
				 */
				export function testHtmlCleaner(): void {
				  const examples = [
				    {
				      input: '<p>Product description with <strong>bold</strong> text</p>',
				      expected: 'Product description with bold text'
				    },
				    {
				      // &nbsp; decodes to a space and runs of spaces are then collapsed to one,
				      // so the two entities produce a single space in the output.
				      input: 'Multiple&nbsp;&nbsp;spaces&amp;entities',
				      expected: 'Multiple spaces&entities'
				    },
				    {
				      input: '<p>First paragraph</p><p>Second paragraph</p>',
				      expected: 'First paragraph\n\nSecond paragraph'
				    },
				    {
				      input: 'Line 1<br/>Line 2<br>Line 3',
				      expected: 'Line 1\nLine 2\nLine 3'
				    },
				    {
				      input: '<ul><li>Item 1</li><li>Item 2</li></ul>',
				      expected: '• Item 1\n• Item 2'
				    }
				  ];

				  console.log('[HTML Cleaner] Running tests...');
				  let passed = 0;
				  let failed = 0;

				  for (const example of examples) {
				    const result = cleanHtmlContent(example.input);
				    if (result === example.expected) {
				      console.log(`✓ PASS: "${example.input}" -> "${result}"`);
				      passed++;
				    } else {
				      console.log(`✗ FAIL: "${example.input}"`);
				      console.log(`  Expected: "${example.expected}"`);
				      console.log(`  Got:      "${result}"`);
				      failed++;
				    }
				  }

				  console.log(`\n[HTML Cleaner] Tests complete: ${passed} passed, ${failed} failed`);
				}
			
--- a/supabase/functions/_shared/qdrant-client.ts
+++ b/supabase/functions/_shared/qdrant-client.ts
@@ -8,6 +8,8 @@
 
				  * Metric: Cosine (best for normalized embeddings)
			
 
				  */
			
 
				 
			
 
				+import { cleanHtmlContent } from './html-cleaner.ts';
			
 
				+
			
 
				 // NOTE(review): the Qdrant endpoint below is a raw IP reached over plain HTTP, and
				 // the API key is a literal committed to source control. Consider loading both from
				 // the runtime environment and rotating this key — confirm against the deployment setup.
				 const QDRANT_URL = 'http://142.93.100.6:6333';
			
 
				 const QDRANT_API_KEY = 'pyXAyyEPbLzba2RvdBwm';
			
 
				 // Vector dimension used for the collection; must equal the embedding size
				 // produced elsewhere — TODO confirm which embedding model this corresponds to.
				 const VECTOR_SIZE = 3072;
			
@@ -514,17 +516,17 @@ export function createProductText(product: any): string {
 
				     parts.push(product.title || product.name);
			
 
				   }
			
 
				 
			
 
				-  // Primary description (cleaned up HTML)
			
 
				+  // Primary description (cleaned up HTML with proper entity decoding and newline handling)
			
 
				   if (product.description) {
			
 
				-    const desc = product.description.replace(/<[^>]*>/g, '').trim();
			
 
				+    const desc = cleanHtmlContent(product.description);
			
 
				     if (desc) {
			
 
				       parts.push(desc);
			
 
				     }
			
 
				   }
			
 
				 
			
 
				-  // Short description (WooCommerce)
			
 
				+  // Short description (WooCommerce) - only add if different from main description
			
 
				   if (product.short_description && product.short_description !== product.description) {
			
 
				-    const shortDesc = product.short_description.replace(/<[^>]*>/g, '').trim();
			
 
				+    const shortDesc = cleanHtmlContent(product.short_description);
			
 
				     if (shortDesc) {
			
 
				       parts.push(shortDesc);
			
 
				     }
			
@@ -595,9 +597,9 @@ export function createProductText(product: any): string {
 
				     }
			
 
				   }
			
 
				 
			
 
				-  // Meta description (ShopRenter/SEO)
			
 
				+  // Meta description (ShopRenter/SEO) - only add if different from main description
			
 
				   if (product.meta_description && product.meta_description !== product.description) {
			
 
				-    const metaDesc = product.meta_description.replace(/<[^>]*>/g, '').trim();
			
 
				+    const metaDesc = cleanHtmlContent(product.meta_description);
			
 
				     if (metaDesc) {
			
 
				       parts.push(metaDesc);
			
 
				     }