Parser for markdown documents (#6)

Closes #1. In this MR I implement only basic functionality: - [x] ATX Heading - [x] Paragraph - [x] Text Node - [x] Softbreak - [x] Links - [x] List Reviewed-on: #6 Co-authored-by: Dmitriy Pleshevskiy <dmitriy@ideascup.me> Co-committed-by: Dmitriy Pleshevskiy <dmitriy@ideascup.me>
2022-06-13 14:58:22 +00:00 · 2022-06-13 14:58:22 +00:00 · e8c6ce97c6
commit e8c6ce97c6
parent 6c175effb4
6 changed files with 591 additions and 12 deletions
--- a/.gitignore
+++ b/.gitignore
@ -9,5 +9,11 @@
 !/import_map.json

 !/core
+/core/*
 !/ren
+/ren/*
+!/par
+/par/*
+
+!/**/*.ts

--- a/par/md.test.ts
+++ b/par/md.test.ts
@ -0,0 +1,260 @@
+import { assertEquals } from "testing/asserts.ts";
+import { HtmlStrRenderer } from "../ren/html_str.ts";
+import { MarkdownParser } from "./md.ts";
+
+const ren = new HtmlStrRenderer();
+
+// Misc
+
+Deno.test({
+  name: "should skip empty line",
+  fn: () => {
+    const par = new MarkdownParser();
+    assertEquals(ren.render(par.parse("\n")), "");
+    assertEquals(ren.render(par.parse("\r\n")), "");
+    assertEquals(ren.render(par.parse("\n\r\n")), "");
+    assertEquals(ren.render(par.parse("\n          \n")), "");
+  },
+});
+
+// ATX Header
+
+Deno.test({
+  name: "should parse empty ATX header",
+  fn: () => {
+    const par = new MarkdownParser();
+    const res = par.parse("#");
+    assertEquals(ren.render(res), "<h1></h1>");
+  },
+});
+
+Deno.test({
+  name: "should parse ATX header with text",
+  fn: () => {
+    const par = new MarkdownParser();
+    assertEquals(ren.render(par.parse("# hello")), "<h1>hello</h1>");
+    assertEquals(ren.render(par.parse("# hello#")), "<h1>hello#</h1>");
+  },
+});
+
+Deno.test({
+  name: "should parse ATX header with specific level",
+  fn: () => {
+    const par = new MarkdownParser();
+    assertEquals(ren.render(par.parse("# hello")), "<h1>hello</h1>");
+    assertEquals(ren.render(par.parse("## hello")), "<h2>hello</h2>");
+    assertEquals(ren.render(par.parse("### hello")), "<h3>hello</h3>");
+    assertEquals(ren.render(par.parse("#### hello")), "<h4>hello</h4>");
+    assertEquals(ren.render(par.parse("##### hello")), "<h5>hello</h5>");
+    assertEquals(ren.render(par.parse("###### hello")), "<h6>hello</h6>");
+  },
+});
+
+Deno.test({
+  name: "should parse ATX header if line contains additional spaces",
+  fn: () => {
+    const par = new MarkdownParser();
+    assertEquals(ren.render(par.parse(" # hello")), "<h1>hello</h1>");
+    assertEquals(ren.render(par.parse("  # hello")), "<h1>hello</h1>");
+    assertEquals(ren.render(par.parse("   # hello")), "<h1>hello</h1>");
+  },
+});
+
+Deno.test({
+  name: "should parse ATX header with closing sequence",
+  fn: () => {
+    const par = new MarkdownParser();
+    assertEquals(ren.render(par.parse("# #")), "<h1></h1>");
+    assertEquals(ren.render(par.parse("# hello #")), "<h1>hello</h1>");
+    assertEquals(ren.render(par.parse("# hello #########")), "<h1>hello</h1>");
+    assertEquals(ren.render(par.parse("# hello #        ")), "<h1>hello</h1>");
+    assertEquals(ren.render(par.parse("###### hello #")), "<h6>hello</h6>");
+  },
+});
+
+Deno.test({
+  name: "should parse many headers with text",
+  fn: () => {
+    const par = new MarkdownParser();
+
+    const input = `\
+# hello
+## world
+### this is
+#### my world!`;
+
+    assertEquals(
+      ren.render(par.parse(input)),
+      "<h1>hello</h1><h2>world</h2><h3>this is</h3><h4>my world!</h4>",
+    );
+  },
+});
+
+// Paragraph
+
+Deno.test({
+  name: "should parse paragraph",
+  fn: () => {
+    const par = new MarkdownParser();
+
+    assertEquals(ren.render(par.parse("hello")), "<p>hello</p>");
+  },
+});
+
+Deno.test({
+  name: "should parse paragraph with softbreak",
+  fn: () => {
+    const par = new MarkdownParser();
+
+    const input = `\
+hello
+world`;
+
+    assertEquals(ren.render(par.parse(input)), "<p>hello world</p>");
+  },
+});
+
+// Link
+
+// Links without a destination fall back to `href="#"`.
+Deno.test({
+  name: "should parse link",
+  fn: () => {
+    const par = new MarkdownParser();
+
+    // empty link text
+    assertEquals(
+      ren.render(par.parse("[]()")),
+      '<p><a href="#"></a></p>',
+    );
+    // single-word link text
+    assertEquals(
+      ren.render(par.parse("[hello]()")),
+      '<p><a href="#">hello</a></p>',
+    );
+    // link text containing a space (this slot used to repeat the case above)
+    assertEquals(
+      ren.render(par.parse("[hello world]()")),
+      '<p><a href="#">hello world</a></p>',
+    );
+  },
+});
+
+Deno.test({
+  name: "should parse link destination",
+  fn: () => {
+    const par = new MarkdownParser();
+
+    assertEquals(
+      ren.render(par.parse("[](/hello)")),
+      '<p><a href="/hello"></a></p>',
+    );
+    assertEquals(
+      ren.render(par.parse("[](/hello?key=value&key2=value2)")),
+      '<p><a href="/hello?key=value&key2=value2"></a></p>',
+    );
+    assertEquals(
+      ren.render(par.parse("[hello](https://example.com)")),
+      '<p><a href="https://example.com">hello</a></p>',
+    );
+    assertEquals(
+      ren.render(par.parse("[hello](mailto:john@example.com)")),
+      '<p><a href="mailto:john@example.com">hello</a></p>',
+    );
+    assertEquals(
+      ren.render(par.parse("[](/привет)")),
+      '<p><a href="/%D0%BF%D1%80%D0%B8%D0%B2%D0%B5%D1%82"></a></p>',
+    );
+    assertEquals(
+      ren.render(par.parse("[](</hello world>)")),
+      '<p><a href="/hello%20world"></a></p>',
+    );
+    assertEquals(
+      ren.render(par.parse("[](</hello world?key=value value2&key2=value3>)")),
+      '<p><a href="/hello%20world?key=value%20value2&key2=value3"></a></p>',
+    );
+  },
+});
+
+Deno.test({
+  name: "should parse link title",
+  fn: () => {
+    const par = new MarkdownParser();
+
+    assertEquals(
+      ren.render(par.parse("[](/hello 'hello')")),
+      '<p><a href="/hello" title="hello"></a></p>',
+    );
+    assertEquals(
+      ren.render(par.parse('[hello](/hello "world")')),
+      '<p><a href="/hello" title="world">hello</a></p>',
+    );
+    assertEquals(
+      ren.render(par.parse('[hello](</hello world> "hello world")')),
+      '<p><a href="/hello%20world" title="hello world">hello</a></p>',
+    );
+  },
+});
+
+// List
+
+Deno.test({
+  name: "should parse list with empty items",
+  fn: () => {
+    const par = new MarkdownParser();
+
+    assertEquals(
+      ren.render(par.parse("-")),
+      "<ul><li></li></ul>",
+    );
+    assertEquals(
+      ren.render(par.parse("- ")),
+      "<ul><li></li></ul>",
+    );
+  },
+});
+
+Deno.test({
+  name: "should parse list if line contains additional spaces",
+  fn: () => {
+    const expected = "<ul><li>hello</li></ul>";
+    const par = new MarkdownParser();
+    assertEquals(ren.render(par.parse(" - hello")), expected);
+    assertEquals(ren.render(par.parse("  - hello")), expected);
+    assertEquals(ren.render(par.parse("   - hello")), expected);
+  },
+});
+
+Deno.test({
+  name: "should not display a single paragraph in the list",
+  fn: () => {
+    const par = new MarkdownParser();
+
+    assertEquals(
+      ren.render(par.parse("- hello")),
+      "<ul><li>hello</li></ul>",
+    );
+    assertEquals(
+      ren.render(par.parse(`\
+- hello
+world`)),
+      "<ul><li>hello world</li></ul>",
+    );
+    assertEquals(
+      ren.render(par.parse(`\
+- hello
+  world`)),
+      "<ul><li>hello world</li></ul>",
+    );
+  },
+});
+
+Deno.test({
+  name: "should parse many items in the list",
+  fn: () => {
+    const par = new MarkdownParser();
+
+    assertEquals(
+      ren.render(par.parse(`\
+- hello
+- world`)),
+      "<ul><li>hello</li><li>world</li></ul>",
+    );
+  },
+});
--- a/par/md.ts
+++ b/par/md.ts
@ -0,0 +1,292 @@
+import { AnyNode, Elem, Fragment, TextNode } from "../core/node.ts";
+import { isNil, Nilable } from "../core/utils.ts";
+import { Parser } from "./types.ts";
+
+const RE_EMPTY_LINE = /^\s*$/;
+
+const RE_OPEN_ATX_HEADING = /^\s{0,3}(#{1,6})(\s|$)/;
+const RE_CLOSE_ATX_HEADING = /(^|\s+)#*\s*$/;
+
+const RE_LIST_ITEM = /^\s{0,3}([-+*])(\s|$)/;
+
+// TODO: make better regex for destination
+const RE_LINK = /\[([\s\S]*?)]\((?:([^\s]*)|<(.+?)>)(?: ('|")(.+?)\4)?\)/;
+
+/**
+ * Markdown parser: builds an internal AST document from the input string and
+ * converts its top-level children into a `Fragment` of elements.
+ */
+export class MarkdownParser implements Parser {
+  /**
+   * Parses `input` block by block.
+   *
+   * Each block parser either returns the still-unread remainder of the input
+   * or `null` when it does not apply, in which case the next one is tried.
+   */
+  parse(input: string): AnyNode {
+    const doc: AstDocument = { kind: AstKind.Document, content: [] };
+
+    let rest = input;
+    while (rest.length) {
+      const remainder = skipEmptyLine(rest) ??
+        parseAtxHeading(doc, rest) ??
+        parseList(doc, rest) ??
+        parseParagraph(doc, rest);
+      // No parser made progress: stop instead of looping forever.
+      if (isNil(remainder)) break;
+      rest = remainder;
+    }
+
+    const children = doc.content.map(DocChild);
+    return new Fragment(children);
+  }
+}
+
+/** Renders a list AST node. Only bullet lists exist so far. */
+function List(ast: AstList): Elem {
+  // Once more list types are added, dispatch on the list type here.
+  const list = BulletList(ast);
+  return list;
+}
+
+/** Renders a bullet list as a `ul` element containing the rendered items. */
+function BulletList(ast: AstBulletList): Elem {
+  const items = ast.content.map(ListItem);
+  return new Elem("ul", {}, items);
+}
+
+/**
+ * Renders a list item as an `li` element.
+ *
+ * An item whose only child is a paragraph is rendered without the wrapping
+ * `p`: the paragraph's inline content becomes the direct content of the `li`.
+ */
+function ListItem(ast: AstListItem): Elem {
+  const [only] = ast.content;
+  if (ast.content.length === 1 && only.kind === AstKind.Paragraph) {
+    return new Elem("li", {}, only.content.map(InlineContent));
+  }
+
+  return new Elem("li", {}, ast.content.map(DocChild));
+}
+
+/** Renders a block-level AST node with the renderer matching its kind. */
+function DocChild(ast: AstDocumentChild): Elem {
+  switch (ast.kind) {
+    case AstKind.List:
+      return List(ast);
+    case AstKind.Paragraph:
+      return Paragraph(ast);
+    case AstKind.AtxHeading:
+      return Heading(ast);
+  }
+}
+
+/** Renders an ATX heading as `h1`…`h6`, depending on the heading level. */
+function Heading(ast: AstAtxHeading): Elem {
+  const children = ast.content.map(InlineContent);
+  return new Elem(`h${ast.level}`, {}, children);
+}
+
+/** Renders a paragraph as a `p` element holding its inline content. */
+function Paragraph(ast: AstParagraph): Elem {
+  const children = ast.content.map(InlineContent);
+  return new Elem("p", {}, children);
+}
+
+/** Renders an inline AST node: links become elements, the rest text nodes. */
+function InlineContent(ast: AstInlineContent): AnyNode {
+  if (ast.kind === AstKind.Link) return Link(ast);
+  return Text(ast);
+}
+
+/**
+ * Renders a link as an `a` element.
+ *
+ * An empty destination falls back to `#`; the `title` attribute is only
+ * emitted when the link has a non-empty title.
+ */
+function Link(ast: AstLink): Elem {
+  const href = ast.destination || "#";
+  const attrs: Record<string, string> = ast.title
+    ? { href, title: ast.title }
+    : { href };
+  const children = ast.content.map(Text);
+
+  return new Elem("a", attrs, children);
+}
+
+/** Wraps the content of a text AST node into a `TextNode`. */
+function Text(ast: AstText): TextNode {
+  const { content } = ast;
+  return new TextNode(content);
+}
+
+// parse utils
+
+/**
+ * Consumes the input when it consists of whitespace only.
+ *
+ * `RE_EMPTY_LINE` is anchored on both ends without the multiline flag, so it
+ * only matches when the whole remaining input is whitespace. Returns the
+ * remainder after the match, or `null` when the input has other content.
+ */
+function skipEmptyLine(readStr: string): string | null {
+  const blank = RE_EMPTY_LINE.exec(readStr);
+  return isNil(blank) ? null : readStr.slice(blank[0].length);
+}
+
+/**
+ * Tries to parse an ATX heading (`#` … `######`) at the start of `readStr`.
+ *
+ * On success a heading node is appended to `ast.content` and the input that
+ * follows the heading line is returned. Returns `null` when `readStr` does
+ * not start with an ATX heading.
+ *
+ * Only the heading's own line contributes to its content. The optional
+ * closing sequence (`# title ##`) is looked up within that line only, so the
+ * lines after the heading are never pulled into it.
+ */
+function parseAtxHeading(ast: AstDocument, readStr: string): string | null {
+  const match = RE_OPEN_ATX_HEADING.exec(readStr);
+  if (isNil(match)) return null;
+
+  readStr = readStr.slice(match[0].length);
+
+  const atxHeading: AstAtxHeading = {
+    kind: AstKind.AtxHeading,
+    level: match[1].length as HeadingLevel,
+    content: [],
+  };
+  ast.content.push(atxHeading);
+
+  // The opening sequence was terminated by the end of the input or by the
+  // line break itself: the heading is empty and the next line is not part
+  // of it.
+  if (match[2] === "" || match[2] === "\n") return readStr;
+
+  // Isolate the heading line; everything after the line break is handed back
+  // to the caller untouched.
+  const lineEnd = readStr.indexOf("\n");
+  const line = lineEnd === -1 ? readStr : readStr.slice(0, lineEnd);
+  const rest = lineEnd === -1 ? "" : readStr.slice(lineEnd + 1);
+
+  // Strip the optional closing sequence and trailing whitespace.
+  const endMatch = RE_CLOSE_ATX_HEADING.exec(line);
+  const headingInlineContent = isNil(endMatch)
+    ? line
+    : line.slice(0, endMatch.index);
+
+  parseInlineContent(atxHeading, headingInlineContent);
+
+  return rest;
+}
+
+/**
+ * Tries to parse a bullet list at the start of `readStr`.
+ *
+ * On success a list node is appended to `ast.content` and the unread
+ * remainder is returned. Returns `null` when `readStr` is empty or does not
+ * start with a list item.
+ *
+ * Consecutive items belong to the same list only while they use the same
+ * bullet character. An item with a different marker is left in the returned
+ * remainder, so the caller starts a new list for it.
+ */
+function parseList(ast: AstDocument, readStr: string): string | null {
+  if (!readStr.length) return null;
+
+  let listMatch = RE_LIST_ITEM.exec(readStr);
+  if (isNil(listMatch)) return null;
+
+  const bulletChar = listMatch[1] as ListBulletChar;
+  const astList: AstBulletList = {
+    kind: AstKind.List,
+    type: AstListType.Bullet,
+    bulletChar,
+    content: [],
+  };
+  ast.content.push(astList);
+
+  do {
+    const astListItem: AstListItem = {
+      kind: AstKind.ListItem,
+      content: [],
+    };
+    astList.content.push(astListItem);
+
+    // Drop the list marker together with the whitespace that follows it.
+    readStr = readStr.slice(listMatch[0].length);
+
+    // TODO: headings and nested lists inside list items are not supported
+    // yet; item content is always parsed as a paragraph.
+    const newReadStr = parseParagraph(astListItem, readStr);
+    // `null` means there was nothing left to read after the marker.
+    if (isNil(newReadStr)) break;
+    readStr = newReadStr;
+
+    listMatch = RE_LIST_ITEM.exec(readStr);
+  } while (!isNil(listMatch) && listMatch[1] === bulletChar);
+
+  return readStr;
+}
+
+/**
+ * Parses a paragraph at the start of `readStr` and appends it to
+ * `ast.content`.
+ *
+ * Lines are collected until one of the following is reached:
+ * - the remaining input is whitespace only,
+ * - the remaining input starts with a list item,
+ * - a blank line follows already collected content (the blank line is
+ *   consumed and ends the paragraph).
+ *
+ * Blank lines in front of the first content line are skipped.
+ *
+ * Returns the unread remainder, or `null` when `readStr` is empty.
+ */
+function parseParagraph(
+  ast: AstDocument | AstListItem,
+  readStr: string,
+): string | null {
+  if (!readStr.length) return null;
+
+  const paragraph: AstParagraph = {
+    kind: AstKind.Paragraph,
+    content: [],
+  };
+  ast.content.push(paragraph);
+
+  let paragraphInlineContent = "";
+  while (!RE_EMPTY_LINE.test(readStr)) {
+    if (RE_LIST_ITEM.test(readStr)) break;
+
+    const lineEnd = readStr.indexOf("\n");
+    const line = lineEnd === -1 ? readStr : readStr.slice(0, lineEnd + 1);
+
+    // Advance by the length of the current line only. Slicing by the length
+    // of the accumulated content would skip input from the third line on.
+    readStr = readStr.slice(line.length);
+
+    if (RE_EMPTY_LINE.test(line)) {
+      // A blank line terminates a started paragraph; leading ones are skipped.
+      if (paragraphInlineContent.length) break;
+      continue;
+    }
+
+    paragraphInlineContent += line;
+  }
+
+  if (paragraphInlineContent.length) {
+    parseInlineContent(paragraph, paragraphInlineContent);
+  }
+
+  return readStr;
+}
+
+/**
+ * Splits `readStr` into text and link nodes and appends them to
+ * `ast.content` in source order.
+ *
+ * Returns `null` when there is nothing (left) to parse, otherwise the result
+ * of parsing the trailing text.
+ */
+function parseInlineContent(
+  ast: AstAtxHeading | AstParagraph,
+  readStr: string,
+): string | null {
+  let rest = readStr;
+
+  while (rest.length) {
+    const linkMatch = RE_LINK.exec(rest);
+    // No further link: everything that is left is plain text.
+    if (isNil(linkMatch)) return parseText(ast, rest);
+
+    // The `<...>` destination (group 3) wins over the bare one (group 2).
+    const astLink: AstLink = {
+      kind: AstKind.Link,
+      destination: encodeURI(linkMatch[3] ?? linkMatch[2]),
+      title: linkMatch[5],
+      content: [],
+    };
+
+    // Text in front of the link.
+    parseText(ast, rest.slice(0, linkMatch.index));
+
+    // The link itself, followed by its inner text.
+    ast.content.push(astLink);
+    parseText(astLink, linkMatch[1]);
+
+    // Continue with whatever follows the link.
+    rest = rest.slice(linkMatch.index + linkMatch[0].length);
+  }
+
+  return null;
+}
+
+/**
+ * Appends one text node per non-empty line of `readStr` to `ast.content`.
+ * Leading whitespace of every line is removed.
+ *
+ * Returns `null` for empty input and an empty string otherwise.
+ */
+function parseText(
+  ast: AstAtxHeading | AstParagraph | AstLink,
+  readStr: string,
+): string | null {
+  if (readStr.length === 0) return null;
+
+  for (const line of readStr.split("\n")) {
+    // Empty lines produce no text node.
+    if (!line) continue;
+
+    const textNode: AstText = {
+      kind: AstKind.Text,
+      content: line.trimStart(),
+    };
+    ast.content.push(textNode);
+  }
+
+  return "";
+}
+
+// AST
+
+// Root of the parsed document; holds the parsed top-level blocks.
+type AstDocument = BaseAstItem<AstKind.Document, AstDocumentChild[]>;
+// Every block-level node. Bullet lists are covered by `AstList`.
+type AstDocumentChild = AstAtxHeading | AstParagraph | AstList;
+
+// All list flavours; only bullet lists exist so far.
+type AstList = AstBulletList; // | AstOrderedList
+
+enum AstListType {
+  Bullet,
+  // Ordered,
+}
+
+// Characters that may introduce a bullet list item.
+type ListBulletChar = "-" | "+" | "*";
+
+type AstListItem = BaseAstItem<AstKind.ListItem, AstListItemChild[]>;
+
+// A list item may hold the same blocks as the document itself.
+type AstListItemChild = AstDocumentChild;
+
+interface AstAtxHeading
+  extends BaseAstItem<AstKind.AtxHeading, AstInlineContent[]> {
+  // Number of `#` characters in the opening sequence.
+  level: HeadingLevel;
+}
+
+type HeadingLevel = 1 | 2 | 3 | 4 | 5 | 6;
+
+interface AstBulletList extends BaseAstItem<AstKind.List, AstListItem[]> {
+  type: AstListType.Bullet;
+  // Marker character of the first item of the list.
+  bulletChar: ListBulletChar;
+}
+
+type AstParagraph = BaseAstItem<AstKind.Paragraph, AstInlineContent[]>;
+
+type AstInlineContent = AstText | AstLink;
+
+interface AstLink extends BaseAstItem<AstKind.Link, AstText[]> {
+  // URI-encoded link destination; may be an empty string.
+  destination: string;
+  title: Nilable<string>;
+}
+
+type AstText = BaseAstItem<AstKind.Text, string>;
+
+// Common shape of all AST nodes: a discriminating `kind` plus its content.
+interface BaseAstItem<K extends AstKind, Cont> {
+  kind: K;
+  content: Cont;
+}
+
+// Discriminator for AST nodes.
+enum AstKind {
+  Document,
+  AtxHeading,
+  Paragraph,
+  List,
+  ListItem,
+  Link,
+  Text,
+}
--- a/par/types.ts
+++ b/par/types.ts
@ -0,0 +1,5 @@
+import { AnyNode } from "../core/node.ts";
+
+export interface Parser {
+  parse(input: string): AnyNode;
+}
--- a/ren/html_str.test.ts
+++ b/ren/html_str.test.ts
@ -19,24 +19,34 @@ Deno.test({
 Deno.test({
  name: "should render element",
  fn: () => {
-    const el = E("p", [], "hello world");
-
    const ren = new HtmlStrRenderer();
-    const res = ren.render(el);

-    assertEquals(res, "<p>hello world</p>");
+    assertEquals(ren.render(E("p", [])), "<p></p>");
+    assertEquals(ren.render(E("p", [], "hello world")), "<p>hello world</p>");
+    assertEquals(
+      ren.render(E("p", [], ["hello", "world"])),
+      "<p>hello world</p>",
+    );
+    assertEquals(
+      ren.render(E("p", [], [E("span", [], "hello"), E("span", [], "world")])),
+      "<p><span>hello</span><span>world</span></p>",
+    );
+    assertEquals(
+      ren.render(E("p", [], ["hello", E("span", [], "world")])),
+      "<p>hello <span>world</span></p>",
+    );
+    assertEquals(
+      ren.render(E("p", [], [E("span", [], "hello"), "world"])),
+      "<p><span>hello</span> world</p>",
+    );
  },
 });

 Deno.test({
  name: "should render empty fragment as empty string",
  fn: () => {
-    const frag = F([]);
-
    const ren = new HtmlStrRenderer();
-    const res = ren.render(frag);
-
-    assertEquals(res, "");
+    assertEquals(ren.render(F([])), "");
  },
 });

@ -52,7 +62,7 @@ Deno.test({
    const ren = new HtmlStrRenderer();
    const res = ren.render(frag);

-    assertEquals(res, 'hello world<div class="hello"></div><p>world</p>');
+    assertEquals(res, 'hello world <div class="hello"></div><p>world</p>');
  },
 });

--- a/ren/html_str.ts
+++ b/ren/html_str.ts
@ -79,7 +79,9 @@ function encodeHtmlFragment(
  node: Fragment,
  hooks: HtmlStrRendererHooks,
 ): string {
-  return concat(node.children.map((ch) => encodeAnyNode(ch, hooks)));
+  return concatEncodedNodes(
+    node.children.map((ch) => encodeAnyNode(ch, hooks)),
+  );
 }

 function encodeHtmlElement(
@ -90,7 +92,11 @@ function encodeHtmlElement(
  if (isSelfClosedTagName(tagName)) return open;

  const encodedChildren = children.map((ch) => encodeAnyNode(ch, hooks));
-  return `${open}${concat(encodedChildren)}</${tagName}>`;
+  return `${open}${concatEncodedNodes(encodedChildren)}</${tagName}>`;
+}
+
+function concatEncodedNodes(encodedChildren: string[]): string {
+  return join(" ", encodedChildren).replace(/>\s+?</g, "><");
 }

 function encodeAttrs(