Browse Source

fix: decode HTML entities before cleaning tags in Qdrant sync #107

- Move HTML entity decoding to FIRST step in cleanHtmlContent()
- This fixes an issue where &lt;p&gt; entities weren't being converted to <p> tags
- Change duplicate newline handling from max 2 to single newlines only
- Update function documentation to reflect changes
- Add test case for encoded HTML entities

Fixes an issue where product descriptions stored in Qdrant contained
entity-encoded HTML tags like &lt;p&gt; instead of properly cleaned text.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Claude 4 months ago
parent
commit
32a8d0e0c6
1 changed file with 24 additions and 18 deletions
  1. 24 18
      supabase/functions/_shared/html-cleaner.ts

+ 24 - 18
supabase/functions/_shared/html-cleaner.ts

@@ -74,33 +74,38 @@ function decodeHtmlEntities(text: string): string {
  * Clean HTML content for plain text storage
  * Clean HTML content for plain text storage
  *
  *
  * This function:
  * This function:
- * - Converts <br>, <br/>, <br />, and </p> tags to newlines
+ * - Decodes HTML entities FIRST (&lt;, &gt;, &nbsp;, &amp;, etc.)
+ * - Converts <br>, <br/>, <br /> tags to newlines
+ * - Converts </p> and block-level closing tags to newlines
  * - Removes all other HTML tags
  * - Removes all other HTML tags
- * - Decodes HTML entities (&nbsp;, &amp;, etc.)
  * - Removes extra whitespace
  * - Removes extra whitespace
- * - Removes duplicate newlines (keeps max 2 consecutive newlines for paragraph separation)
+ * - Removes duplicate newlines (keeps only single newlines)
  * - Trims leading/trailing whitespace
  * - Trims leading/trailing whitespace
  *
  *
  * @param html - HTML string to clean
  * @param html - HTML string to clean
- * @returns Plain text with preserved paragraph structure
+ * @returns Plain text with single newlines preserved
  */
  */
 export function cleanHtmlContent(html: string | null | undefined): string {
 export function cleanHtmlContent(html: string | null | undefined): string {
   if (!html) return '';
   if (!html) return '';
 
 
   let text = html;
   let text = html;
 
 
-  // Step 1: Replace block-level tags with newlines to preserve paragraph structure
+  // Step 1: Decode HTML entities FIRST
+  // This is critical because the input might have encoded HTML like &lt;p&gt; instead of <p>
+  text = decodeHtmlEntities(text);
+
+  // Step 2: Replace block-level tags with newlines to preserve paragraph structure
   // Convert <br>, <br/>, <br /> tags to newlines
   // Convert <br>, <br/>, <br /> tags to newlines
   text = text.replace(/<br\s*\/?>/gi, '\n');
   text = text.replace(/<br\s*\/?>/gi, '\n');
 
 
-  // Convert closing paragraph tags to double newlines (paragraph break)
-  text = text.replace(/<\/p>/gi, '\n\n');
+  // Convert closing paragraph tags to newlines (single newline, duplicates removed later)
+  text = text.replace(/<\/p>/gi, '\n');
 
 
-  // Convert closing div tags to double newlines
-  text = text.replace(/<\/div>/gi, '\n\n');
+  // Convert closing div tags to newlines
+  text = text.replace(/<\/div>/gi, '\n');
 
 
-  // Convert closing heading tags to double newlines
-  text = text.replace(/<\/(h[1-6])>/gi, '\n\n');
+  // Convert closing heading tags to newlines
+  text = text.replace(/<\/(h[1-6])>/gi, '\n');
 
 
   // Convert list items to newlines with bullet
   // Convert list items to newlines with bullet
   text = text.replace(/<li[^>]*>/gi, '\n• ');
   text = text.replace(/<li[^>]*>/gi, '\n• ');
@@ -109,12 +114,9 @@ export function cleanHtmlContent(html: string | null | undefined): string {
   // Convert closing list tags to newlines
   // Convert closing list tags to newlines
   text = text.replace(/<\/(ul|ol)>/gi, '\n');
   text = text.replace(/<\/(ul|ol)>/gi, '\n');
 
 
-  // Step 2: Remove all remaining HTML tags
+  // Step 3: Remove all remaining HTML tags
   text = text.replace(/<[^>]*>/g, '');
   text = text.replace(/<[^>]*>/g, '');
 
 
-  // Step 3: Decode HTML entities
-  text = decodeHtmlEntities(text);
-
   // Step 4: Normalize whitespace
   // Step 4: Normalize whitespace
   // Replace tabs with spaces
   // Replace tabs with spaces
   text = text.replace(/\t/g, ' ');
   text = text.replace(/\t/g, ' ');
@@ -128,8 +130,8 @@ export function cleanHtmlContent(html: string | null | undefined): string {
   // Replace Mac line endings with Unix line endings
   // Replace Mac line endings with Unix line endings
   text = text.replace(/\r/g, '\n');
   text = text.replace(/\r/g, '\n');
 
 
-  // Step 5: Remove duplicate newlines (keep max 2 for paragraph separation)
-  text = text.replace(/\n{3,}/g, '\n\n');
+  // Step 5: Remove duplicate newlines (keep only single newlines as per requirement)
+  text = text.replace(/\n{2,}/g, '\n');
 
 
   // Step 6: Trim each line (remove leading/trailing spaces)
   // Step 6: Trim each line (remove leading/trailing spaces)
   text = text.split('\n').map(line => line.trim()).join('\n');
   text = text.split('\n').map(line => line.trim()).join('\n');
@@ -187,7 +189,7 @@ export function testHtmlCleaner(): void {
     },
     },
     {
     {
       input: '<p>First paragraph</p><p>Second paragraph</p>',
       input: '<p>First paragraph</p><p>Second paragraph</p>',
-      expected: 'First paragraph\n\nSecond paragraph'
+      expected: 'First paragraph\nSecond paragraph'
     },
     },
     {
     {
       input: 'Line 1<br/>Line 2<br>Line 3',
       input: 'Line 1<br/>Line 2<br>Line 3',
@@ -196,6 +198,10 @@ export function testHtmlCleaner(): void {
     {
     {
       input: '<ul><li>Item 1</li><li>Item 2</li></ul>',
       input: '<ul><li>Item 1</li><li>Item 2</li></ul>',
       expected: '• Item 1\n• Item 2'
       expected: '• Item 1\n• Item 2'
+    },
+    {
+      input: '&lt;p&gt;Encoded HTML entities&lt;/p&gt;&lt;br/&gt;Next line',
+      expected: 'Encoded HTML entities\nNext line'
     }
     }
   ];
   ];