|
@@ -74,33 +74,38 @@ function decodeHtmlEntities(text: string): string {
|
|
|
* Clean HTML content for plain text storage
|
|
* Clean HTML content for plain text storage
|
|
|
*
|
|
*
|
|
|
* This function:
|
|
* This function:
|
|
|
- * - Converts <br>, <br/>, <br />, and </p> tags to newlines
|
|
|
|
|
|
|
+ * - Decodes HTML entities FIRST (&lt;, &gt;, &nbsp;, &amp;, etc.)
|
|
|
|
|
+ * - Converts <br>, <br/>, <br /> tags to newlines
|
|
|
|
|
+ * - Converts </p> and block-level closing tags to newlines
|
|
|
* - Removes all other HTML tags
|
|
* - Removes all other HTML tags
|
|
|
- * - Decodes HTML entities (&nbsp;, &amp;, etc.)
|
|
|
|
|
* - Removes extra whitespace
|
|
* - Removes extra whitespace
|
|
|
- * - Removes duplicate newlines (keeps max 2 consecutive newlines for paragraph separation)
|
|
|
|
|
|
|
+ * - Removes duplicate newlines (keeps only single newlines)
|
|
|
* - Trims leading/trailing whitespace
|
|
* - Trims leading/trailing whitespace
|
|
|
*
|
|
*
|
|
|
* @param html - HTML string to clean
|
|
* @param html - HTML string to clean
|
|
|
- * @returns Plain text with preserved paragraph structure
|
|
|
|
|
|
|
+ * @returns Plain text with single newlines preserved
|
|
|
*/
|
|
*/
|
|
|
export function cleanHtmlContent(html: string | null | undefined): string {
|
|
export function cleanHtmlContent(html: string | null | undefined): string {
|
|
|
if (!html) return '';
|
|
if (!html) return '';
|
|
|
|
|
|
|
|
let text = html;
|
|
let text = html;
|
|
|
|
|
|
|
|
- // Step 1: Replace block-level tags with newlines to preserve paragraph structure
|
|
|
|
|
|
|
+ // Step 1: Decode HTML entities FIRST
|
|
|
|
|
+ // This is critical because the input might have encoded HTML like &lt;p&gt; instead of <p>
|
|
|
|
|
+ text = decodeHtmlEntities(text);
|
|
|
|
|
+
|
|
|
|
|
+ // Step 2: Replace block-level tags with newlines to preserve paragraph structure
|
|
|
// Convert <br>, <br/>, <br /> tags to newlines
|
|
// Convert <br>, <br/>, <br /> tags to newlines
|
|
|
text = text.replace(/<br\s*\/?>/gi, '\n');
|
|
text = text.replace(/<br\s*\/?>/gi, '\n');
|
|
|
|
|
|
|
|
- // Convert closing paragraph tags to double newlines (paragraph break)
|
|
|
|
|
- text = text.replace(/<\/p>/gi, '\n\n');
|
|
|
|
|
|
|
+ // Convert closing paragraph tags to newlines (single newline, duplicates removed later)
|
|
|
|
|
+ text = text.replace(/<\/p>/gi, '\n');
|
|
|
|
|
|
|
|
- // Convert closing div tags to double newlines
|
|
|
|
|
- text = text.replace(/<\/div>/gi, '\n\n');
|
|
|
|
|
|
|
+ // Convert closing div tags to newlines
|
|
|
|
|
+ text = text.replace(/<\/div>/gi, '\n');
|
|
|
|
|
|
|
|
- // Convert closing heading tags to double newlines
|
|
|
|
|
- text = text.replace(/<\/(h[1-6])>/gi, '\n\n');
|
|
|
|
|
|
|
+ // Convert closing heading tags to newlines
|
|
|
|
|
+ text = text.replace(/<\/(h[1-6])>/gi, '\n');
|
|
|
|
|
|
|
|
// Convert list items to newlines with bullet
|
|
// Convert list items to newlines with bullet
|
|
|
text = text.replace(/<li[^>]*>/gi, '\n• ');
|
|
text = text.replace(/<li[^>]*>/gi, '\n• ');
|
|
@@ -109,12 +114,9 @@ export function cleanHtmlContent(html: string | null | undefined): string {
|
|
|
// Convert closing list tags to newlines
|
|
// Convert closing list tags to newlines
|
|
|
text = text.replace(/<\/(ul|ol)>/gi, '\n');
|
|
text = text.replace(/<\/(ul|ol)>/gi, '\n');
|
|
|
|
|
|
|
|
- // Step 2: Remove all remaining HTML tags
|
|
|
|
|
|
|
+ // Step 3: Remove all remaining HTML tags
|
|
|
text = text.replace(/<[^>]*>/g, '');
|
|
text = text.replace(/<[^>]*>/g, '');
|
|
|
|
|
|
|
|
- // Step 3: Decode HTML entities
|
|
|
|
|
- text = decodeHtmlEntities(text);
|
|
|
|
|
-
|
|
|
|
|
// Step 4: Normalize whitespace
|
|
// Step 4: Normalize whitespace
|
|
|
// Replace tabs with spaces
|
|
// Replace tabs with spaces
|
|
|
text = text.replace(/\t/g, ' ');
|
|
text = text.replace(/\t/g, ' ');
|
|
@@ -128,8 +130,8 @@ export function cleanHtmlContent(html: string | null | undefined): string {
|
|
|
// Replace Mac line endings with Unix line endings
|
|
// Replace Mac line endings with Unix line endings
|
|
|
text = text.replace(/\r/g, '\n');
|
|
text = text.replace(/\r/g, '\n');
|
|
|
|
|
|
|
|
- // Step 5: Remove duplicate newlines (keep max 2 for paragraph separation)
|
|
|
|
|
- text = text.replace(/\n{3,}/g, '\n\n');
|
|
|
|
|
|
|
+ // Step 5: Remove duplicate newlines (keep only single newlines as per requirement)
|
|
|
|
|
+ text = text.replace(/\n{2,}/g, '\n');
|
|
|
|
|
|
|
|
// Step 6: Trim each line (remove leading/trailing spaces)
|
|
// Step 6: Trim each line (remove leading/trailing spaces)
|
|
|
text = text.split('\n').map(line => line.trim()).join('\n');
|
|
text = text.split('\n').map(line => line.trim()).join('\n');
|
|
@@ -187,7 +189,7 @@ export function testHtmlCleaner(): void {
|
|
|
},
|
|
},
|
|
|
{
|
|
{
|
|
|
input: '<p>First paragraph</p><p>Second paragraph</p>',
|
|
input: '<p>First paragraph</p><p>Second paragraph</p>',
|
|
|
- expected: 'First paragraph\n\nSecond paragraph'
|
|
|
|
|
|
|
+ expected: 'First paragraph\nSecond paragraph'
|
|
|
},
|
|
},
|
|
|
{
|
|
{
|
|
|
input: 'Line 1<br/>Line 2<br>Line 3',
|
|
input: 'Line 1<br/>Line 2<br>Line 3',
|
|
@@ -196,6 +198,10 @@ export function testHtmlCleaner(): void {
|
|
|
{
|
|
{
|
|
|
input: '<ul><li>Item 1</li><li>Item 2</li></ul>',
|
|
input: '<ul><li>Item 1</li><li>Item 2</li></ul>',
|
|
|
expected: '• Item 1\n• Item 2'
|
|
expected: '• Item 1\n• Item 2'
|
|
|
|
|
+ },
|
|
|
|
|
+ {
|
|
|
|
|
+ input: '<p>Encoded HTML entities</p><br/>Next line',
|
|
|
|
|
+ expected: 'Encoded HTML entities\nNext line'
|
|
|
}
|
|
}
|
|
|
];
|
|
];
|
|
|
|
|
|