refactored line converter, untied its logic from greentexting, better handling of broken cases
2024-11-12 19:44:53 +00:00 · 2021-06-13 13:29:26 +03:00 · 2021-06-13 13:29:26 +03:00 · bebafa1a2c
commit bebafa1a2c
parent e825021ef1
3 changed files with 67 additions and 15 deletions
--- a/src/components/rich_content/rich_content.jsx
+++ b/src/components/rich_content/rich_content.jsx
@ -246,6 +246,7 @@ const getLinkData = (attrs, children, index) => {
 */
 export const preProcessPerLine = (html, greentext, handleLinks) => {
  const lastMentions = []
  const greentextHandle = new Set(['p', 'div'])
  let nonEmptyIndex = -1
  const newHtml = convertHtmlToLines(html).reverse().map((item, index, array) => {
@ -256,7 +257,14 @@ export const preProcessPerLine = (html, greentext, handleLinks) => {
    nonEmptyIndex += 1
    // Greentext stuff
-    if (greentext && (string.includes('&gt;') || string.includes('&lt;'))) {
+    if (
      // Only if greentext is engaged
      greentext &&
        // Only handle p's and divs. Don't want to affect blockquotes, code etc
        item.level.every(l => greentextHandle.has(l)) &&
        // Only if line begins with '>' or '<'
        (string.includes('&gt;') || string.includes('&lt;'))
    ) {
      const cleanedString = string.replace(/<[^>]+?>/gi, '') // remove all tags
        .replace(/@\w+/gi, '') // remove mentions (even failed ones)
        .trim()
--- a/src/services/html_converter/html_line_converter.service.js
+++ b/src/services/html_converter/html_line_converter.service.js
@ -19,9 +19,42 @@ import { getTagName } from './utility.service.js'
 * @return {(string|{ text: string })[]} processed html in form of a list.
 */
 export const convertHtmlToLines = (html) => {
-  const ignoredTags = new Set(['code', 'blockquote'])
+  // Elements that are implicitly self-closing
-  const handledTags = new Set(['p', 'br', 'div', 'pre', 'code', 'blockquote'])
+  // https://developer.mozilla.org/en-US/docs/Glossary/empty_element
-  const openCloseTags = new Set(['p', 'div', 'pre', 'code', 'blockquote'])
+  const emptyElements = new Set([
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
  ])
  // Block-level element (they make a visual line)
  // https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
  const blockElements = new Set([
    'address', 'article', 'aside', 'blockquote', 'details', 'dialog', 'dd',
    'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'main',
    'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul'
  ])
  // <br> is a special case: it is technically not a block-level element, but
  // it is essentially converted to a \n (or \r\n). There is also <wbr>, but it
  // does not guarantee a line break, it only suggests one.
  const linebreakElements = new Set(['br'])
  const visualLineElements = new Set([
    ...blockElements.values(),
    ...linebreakElements.values()
  ])
  // All block-level elements that aren't empty elements, i.e. not <hr>
  const nonEmptyElements = new Set(visualLineElements)
  // Difference
  for (let elem of emptyElements) {
    nonEmptyElements.delete(elem)
  }
  // All elements that we are recognizing
  const allElements = new Set([
    ...nonEmptyElements.values(),
    ...emptyElements.values()
  ])
  let buffer = [] // Current output buffer
  const level = [] // How deep we are in tags and which tags were there
@ -29,8 +62,8 @@ export const convertHtmlToLines = (html) => {
  let tagBuffer = null // Current tag buffer, if null = we are not currently reading a tag
  const flush = () => { // Processes current line buffer, adds it to output buffer and clears line buffer
-    if (textBuffer.trim().length > 0 && !level.some(l => ignoredTags.has(l))) {
+    if (textBuffer.trim().length > 0) {
-      buffer.push({ text: textBuffer })
+      buffer.push({ level: [...level], text: textBuffer })
    } else {
      buffer.push(textBuffer)
    }
@ -49,10 +82,12 @@ export const convertHtmlToLines = (html) => {
  }
  const handleClose = (tag) => { // handles closing tags
    flush()
    buffer.push(tag)
    if (level[0] === getTagName(tag)) {
      flush()
      buffer.push(tag)
      level.shift()
    } else { // Broken case
      textBuffer += tag
    }
  }
@ -67,10 +102,10 @@ export const convertHtmlToLines = (html) => {
      const tagFull = tagBuffer
      tagBuffer = null
      const tagName = getTagName(tagFull)
-      if (handledTags.has(tagName)) {
+      if (allElements.has(tagName)) {
-        if (tagName === 'br') {
+        if (linebreakElements.has(tagName)) {
          handleBr(tagFull)
-        } else if (openCloseTags.has(tagName)) {
+        } else if (nonEmptyElements.has(tagName)) {
          if (tagFull[1] === '/') {
            handleClose(tagFull)
          } else if (tagFull[tagFull.length - 2] === '/') {
--- a/test/unit/specs/services/html_converter/html_line_converter.spec.js
+++ b/test/unit/specs/services/html_converter/html_line_converter.spec.js
@ -1,8 +1,17 @@
 import { convertHtmlToLines } from 'src/services/html_converter/html_line_converter.service.js'
-const mapOnlyText = (processor) => (input) => input.text ? processor(input.text) : input
+const greentextHandle = new Set(['p', 'div'])
 const mapOnlyText = (processor) => (input) => {
  if (input.text && input.level.every(l => greentextHandle.has(l))) {
    return processor(input.text)
  } else if (input.text) {
    return input.text
  } else {
    return input
  }
 }
-describe('html_line_converter', () => {
+describe.only('html_line_converter', () => {
  describe('with processor that keeps original line should not make any changes to HTML when', () => {
    const processorKeep = (line) => line
    it('fed with regular HTML with newlines', () => {
@ -81,7 +90,7 @@ describe('html_line_converter', () => {
    it('fed with very broken HTML with broken composition', () => {
      const input = '</p> lmao what </div> whats going on <div> wha <p>'
-      const output = '</p>_</div>_<div>_<p>'
+      const output = '_<div>_<p>'
      const result = convertHtmlToLines(input)
      const comparableResult = result.map(mapOnlyText(processorReplace)).join('')
      expect(comparableResult).to.eql(output)
@ -111,7 +120,7 @@ describe('html_line_converter', () => {
      expect(comparableResult).to.eql(output)
    })
-    it('fed with maybe valid HTML? self-closing divs and ps', () => {
+    it('fed with maybe valid HTML? (XHTML) self-closing divs and ps', () => {
      const input = 'a <div class="what"/> what now <p aria-label="wtf"/> ?'
      const output = '_<div class="what"/>_<p aria-label="wtf"/>_'
      const result = convertHtmlToLines(input)