feat: move title extraction into commons package

Signed-off-by: Tilman Vatteroth <git@tilmanvatteroth.de>
2025-05-25 12:34:45 -04:00 · 2023-04-08 21:31:27 +02:00 · 2023-04-08 21:31:27 +02:00 · 3962cafa5d
commit 3962cafa5d
parent 8de8a50bec
6 changed files with 124 additions and 36 deletions
--- a/commons/src/index.ts
+++ b/commons/src/index.ts
@ -32,3 +32,5 @@ export * from './title-extraction/generate-note-title.js'
 export * from './title-extraction/types/iso6391.js'
 export * from './title-extraction/types/frontmatter.js'
 export * from './title-extraction/types/slide-show-options.js'
+
+export { extractFirstHeading } from './title-extraction/extract-first-heading.js'
--- a/commons/src/title-extraction/extract-first-heading.spec.ts
+++ b/commons/src/title-extraction/extract-first-heading.spec.ts
@ -0,0 +1,60 @@
+/*
+ * SPDX-FileCopyrightText: 2023 The HedgeDoc developers (see AUTHORS file)
+ *
+ * SPDX-License-Identifier: AGPL-3.0-only
+ */
+import { extractFirstHeading } from './extract-first-heading.js'
+import { describe, expect, it } from '@jest/globals'
+import { Document, Element, Text } from 'domhandler'
+
+// Spec for extractFirstHeading. The per-level cases run once for each of h1 to h6.
+describe('extract first heading', () => {
+  describe.each([1, 2, 3, 4, 5, 6])('h%d', (headlineIndex) => {
+    it('extracts plain text', () => {
+      const content = `headline${headlineIndex}`
+      const headline = new Element(`h${headlineIndex}`, {}, [new Text(content)])
+      const document = new Document([headline])
+      expect(extractFirstHeading(document)).toBe(content)
+    })
+
+    it("doesn't extract heading-anchor", () => {
+      const headline = new Element(`h${headlineIndex}`, {}, [
+        new Element('a', { class: 'class1 heading-anchor class2' }, [
+          new Text('invalid link content')
+        ])
+      ])
+      const document = new Document([headline])
+      expect(extractFirstHeading(document)).toBe('')
+    })
+
+    it('extracts nested texts', () => {
+      const headline = new Element(`h${headlineIndex}`, {}, [
+        new Element('a', {}, [
+          new Text('Valid'),
+          new Element('div', {}, [new Text('Text')]),
+          new Text(`${headlineIndex}`)
+        ])
+      ])
+      const document = new Document([headline])
+      expect(extractFirstHeading(document)).toBe(`ValidText${headlineIndex}`)
+    })
+
+    it('extracts image alt texts', () => {
+      const headline = new Element(`h${headlineIndex}`, {}, [
+        new Element('img', { alt: 'Image Alt' })
+      ])
+      const document = new Document([headline])
+      expect(extractFirstHeading(document)).toBe('Image Alt')
+    })
+
+    it('extracts only the first found headline', () => {
+      const headline1 = new Element(`h${headlineIndex}`, {}, [
+        new Text(`headline${headlineIndex}`)
+      ])
+      // The second headline needs a text that differs from the first one for
+      // every index. Otherwise the test can't tell which headline was used.
+      const headline2 = new Element(`h${headlineIndex}`, {}, [
+        new Text('second headline')
+      ])
+      const document = new Document([headline1, headline2])
+      expect(extractFirstHeading(document)).toBe(`headline${headlineIndex}`)
+    })
+
+    it("doesn't extract headlines that aren't direct children", () => {
+      const wrapper = new Element('div', {}, [
+        new Element(`h${headlineIndex}`, {}, [new Text('nested headline')])
+      ])
+      const document = new Document([wrapper])
+      expect(extractFirstHeading(document)).toBeUndefined()
+    })
+  })
+
+  it('returns undefined if there is no headline', () => {
+    const paragraph = new Element('p', {}, [new Text('no headline')])
+    const document = new Document([paragraph])
+    expect(extractFirstHeading(document)).toBeUndefined()
+  })
+})
--- a/commons/src/title-extraction/extract-first-heading.ts
+++ b/commons/src/title-extraction/extract-first-heading.ts
@ -0,0 +1,57 @@
+/*
+ * SPDX-FileCopyrightText: 2023 The HedgeDoc developers (see AUTHORS file)
+ *
+ * SPDX-License-Identifier: AGPL-3.0-only
+ */
+import { Element, isTag, isText, Node, NodeWithChildren } from 'domhandler'
+
+// Matches a complete tag name of a headline element (h1 to h6), ignoring case.
+// Deliberately not global: the expression is only used for yes/no checks and
+// a shared global expression would carry lastIndex state between calls.
+const headlineTagRegex = /^h[1-6]$/i
+
+/**
+ * Finds the first headline tag (h1 to h6) among the direct children of the
+ * given node and returns its text content.
+ *
+ * @param nodes The parent node whose direct children are searched for a headline
+ * @returns The trimmed plain text of the first headline, or `undefined` if none of the children is a headline
+ */
+export function extractFirstHeading(
+  nodes: NodeWithChildren
+): string | undefined {
+  const firstHeadline = checkNodesForHeadline(nodes.children)
+  return firstHeadline === undefined
+    ? undefined
+    : extractInnerTextFromNode(firstHeadline).trim()
+}
+
+/**
+ * Looks for the first headline tag in the given list of nodes.
+ * Only the given nodes themselves are checked, not their descendants.
+ *
+ * @param nodes The nodes to check
+ * @returns The first node that is a tag whose name matches the headline expression, or `undefined` if there is none
+ */
+function checkNodesForHeadline(nodes: Node[]): Node | undefined {
+  for (const candidate of nodes) {
+    if (isTag(candidate) && candidate.name.match(headlineTagRegex)) {
+      return candidate
+    }
+  }
+  return undefined
+}
+
+/**
+ * Converts a node into its plain text representation.
+ *
+ * @param node The node to convert
+ * @returns The node value for text nodes, the extracted inner text for tags and an empty string for every other node type
+ */
+function extractInnerTextFromNode(node: Node): string {
+  if (isText(node)) {
+    return node.nodeValue
+  }
+  return isTag(node) ? extractInnerTextFromTag(node) : ''
+}
+
+/**
+ * Builds the plain text representation of a tag.
+ *
+ * - Links that carry the class `heading-anchor` contribute no text.
+ * - Images contribute their `alt` attribute, or nothing if it is missing.
+ * - Every other tag contributes the concatenated text of its children.
+ *
+ * @param node The tag to extract the text from
+ * @returns The extracted plain text. May be an empty string.
+ */
+function extractInnerTextFromTag(node: Element): string {
+  if (node.name === 'a') {
+    // Class names may be separated by any whitespace (spaces, tabs, line
+    // breaks), not only by a single space character.
+    const classNames = findAttribute(node, 'class')?.value.split(/\s+/) ?? []
+    if (classNames.includes('heading-anchor')) {
+      return ''
+    }
+  }
+  if (node.name === 'img') {
+    return findAttribute(node, 'alt')?.value ?? ''
+  }
+  return node.children.reduce(
+    (text, child) => text + extractInnerTextFromNode(child),
+    ''
+  )
+}
+
+/**
+ * Looks up an attribute of a tag by its exact name.
+ *
+ * @param node The tag whose attributes should be searched
+ * @param attributeName The name of the wanted attribute
+ * @returns The first attribute with the given name, or `undefined` if the tag has no such attribute
+ */
+function findAttribute(node: Element, attributeName: string) {
+  const { attributes } = node
+  return attributes.find(({ name }) => name === attributeName)
+}