4 months ago · 32a8d0e0c6
--- a/supabase/functions/_shared/html-cleaner.ts
+++ b/supabase/functions/_shared/html-cleaner.ts
@@ -74,33 +74,38 @@ function decodeHtmlEntities(text: string): string {
 
															  * Clean HTML content for plain text storage
														
 
															  *
														
 
															  * This function:
														
 
															- * - Converts <br>, <br/>, <br />, and </p> tags to newlines
														
 
															+ * - Decodes HTML entities FIRST (&lt;, &gt;, &nbsp;, &amp;, etc.)
														
 
															+ * - Converts <br>, <br/>, <br /> tags to newlines
														
 
															+ * - Converts </p> and block-level closing tags to newlines
														
 
															  * - Removes all other HTML tags
														
 
															- * - Decodes HTML entities (&nbsp;, &amp;, etc.)
														
 
															  * - Removes extra whitespace
														
 
															- * - Removes duplicate newlines (keeps max 2 consecutive newlines for paragraph separation)
														
 
															+ * - Removes duplicate newlines (keeps only single newlines)
														
 
															  * - Trims leading/trailing whitespace
														
 
															  *
														
 
															  * @param html - HTML string to clean
														
 
															- * @returns Plain text with preserved paragraph structure
														
 
															+ * @returns Plain text with single newlines preserved
														
 
															  */
														
 
															 export function cleanHtmlContent(html: string | null | undefined): string {
														
 
															   if (!html) return '';
														
 
															   let text = html;
														
 
															-  // Step 1: Replace block-level tags with newlines to preserve paragraph structure
														
 
															+  // Step 1: Decode HTML entities FIRST
														
 
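															+  // The input may contain entity-encoded markup (e.g. &lt;p&gt; instead of <p>); it must be decoded
															+  // before the tag-matching steps below can recognise it. Caveat: literal text with encoded angle
															+  // brackets (e.g. "a &lt; b &gt; c") is decoded too, so Step 3 will strip "< b >" as if it were a tag.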
														
 
															+  text = decodeHtmlEntities(text);
														
 
															+
														
 
															+  // Step 2: Replace block-level tags with newlines to preserve paragraph structure
														
 
															   // Convert <br>, <br/>, <br /> tags to newlines
														
 
															   text = text.replace(/<br\s*\/?>/gi, '\n');
														
 
															-  // Convert closing paragraph tags to double newlines (paragraph break)
														
 
															-  text = text.replace(/<\/p>/gi, '\n\n');
														
 
															+  // Convert closing paragraph tags to newlines (single newline, duplicates removed later)
														
 
															+  text = text.replace(/<\/p>/gi, '\n');
														
 
															-  // Convert closing div tags to double newlines
														
 
															-  text = text.replace(/<\/div>/gi, '\n\n');
														
 
															+  // Convert closing div tags to newlines
														
 
															+  text = text.replace(/<\/div>/gi, '\n');
														
 
															-  // Convert closing heading tags to double newlines
														
 
															-  text = text.replace(/<\/(h[1-6])>/gi, '\n\n');
														
 
															+  // Convert closing heading tags to newlines
														
 
															+  text = text.replace(/<\/(h[1-6])>/gi, '\n');
														
 
															   // Convert list items to newlines with bullet
														
 
															   text = text.replace(/<li[^>]*>/gi, '\n• ');
														
@@ -109,12 +114,9 @@ export function cleanHtmlContent(html: string | null | undefined): string {
 
															   // Convert closing list tags to newlines
														
 
															   text = text.replace(/<\/(ul|ol)>/gi, '\n');
														
 
															-  // Step 2: Remove all remaining HTML tags
														
 
															+  // Step 3: Remove all remaining HTML tags
														
 
															   text = text.replace(/<[^>]*>/g, '');
														
 
															-  // Step 3: Decode HTML entities
														
 
															-  text = decodeHtmlEntities(text);
														
 
															-
														
 
															   // Step 4: Normalize whitespace
														
 
															   // Replace tabs with spaces
														
 
															   text = text.replace(/\t/g, ' ');
														
@@ -128,8 +130,8 @@ export function cleanHtmlContent(html: string | null | undefined): string {
 
															   // Replace Mac line endings with Unix line endings
														
 
															   text = text.replace(/\r/g, '\n');
														
 
															-  // Step 5: Remove duplicate newlines (keep max 2 for paragraph separation)
														
 
															-  text = text.replace(/\n{3,}/g, '\n\n');
														
 
															+  // Step 5: Collapse every run of two or more consecutive newlines into a single newline
														
 
															+  text = text.replace(/\n{2,}/g, '\n');
														
 
															   // Step 6: Trim each line (remove leading/trailing spaces)
														
 
															   text = text.split('\n').map(line => line.trim()).join('\n');
														
@@ -187,7 +189,7 @@ export function testHtmlCleaner(): void {
 
															     },
														
 
															     {
														
 
															       input: '<p>First paragraph</p><p>Second paragraph</p>',
														
 
															-      expected: 'First paragraph\n\nSecond paragraph'
														
 
															+      expected: 'First paragraph\nSecond paragraph'
														
 
															     },
														
 
															     {
														
 
															       input: 'Line 1<br/>Line 2<br>Line 3',
														
@@ -196,6 +198,10 @@ export function testHtmlCleaner(): void {
 
															     {
														
 
															       input: '<ul><li>Item 1</li><li>Item 2</li></ul>',
														
 
															       expected: '• Item 1\n• Item 2'
														
 
															+    },
														
 
															+    {
														
 
															+      input: '&lt;p&gt;Encoded HTML entities&lt;/p&gt;&lt;br/&gt;Next line',
														
 
															+      expected: 'Encoded HTML entities\nNext line'
														
 
															     }
														
 
															   ];