From e8c6ce97c67f3c0d1a282176da526e55ce29c34d Mon Sep 17 00:00:00 2001 From: Dmitriy Pleshevskiy Date: Mon, 13 Jun 2022 14:58:22 +0000 Subject: [PATCH] Parser for markdown documents (#6) Closes #1 In this MR I implement only basic functionality - [x] AXT Heading - [x] Paragraph - [x] Text Node - [x] Softbreak - [x] Links - [x] List Reviewed-on: https://git.pleshevski.ru/pleshevskiy/paren/pulls/6 Co-authored-by: Dmitriy Pleshevskiy Co-committed-by: Dmitriy Pleshevskiy --- .gitignore | 6 + par/md.test.ts | 260 ++++++++++++++++++++++++++++++++++++++ par/md.ts | 292 +++++++++++++++++++++++++++++++++++++++++++ par/types.ts | 5 + ren/html_str.test.ts | 30 +++-- ren/html_str.ts | 10 +- 6 files changed, 591 insertions(+), 12 deletions(-) create mode 100644 par/md.test.ts create mode 100644 par/md.ts create mode 100644 par/types.ts diff --git a/.gitignore b/.gitignore index 8c55080..4634098 100644 --- a/.gitignore +++ b/.gitignore @@ -9,5 +9,11 @@ !/import_map.json !/core +/core/* !/ren +/ren/* +!/par +/par/* + +!/**/*.ts diff --git a/par/md.test.ts b/par/md.test.ts new file mode 100644 index 0000000..46dbf84 --- /dev/null +++ b/par/md.test.ts @@ -0,0 +1,260 @@ +import { assertEquals } from "testing/asserts.ts"; +import { HtmlStrRenderer } from "../ren/html_str.ts"; +import { MarkdownParser } from "./md.ts"; + +const ren = new HtmlStrRenderer(); + +// Misc + +Deno.test({ + name: "should skip empty line", + fn: () => { + const par = new MarkdownParser(); + assertEquals(ren.render(par.parse("\n")), ""); + assertEquals(ren.render(par.parse("\r\n")), ""); + assertEquals(ren.render(par.parse("\n\r\n")), ""); + assertEquals(ren.render(par.parse("\n \n")), ""); + }, +}); + +// ATX Header + +Deno.test({ + name: "should parse empty ATX header", + fn: () => { + const par = new MarkdownParser(); + const res = par.parse("#"); + assertEquals(ren.render(res), "

"); + }, +}); + +Deno.test({ + name: "should parse ATX header with text", + fn: () => { + const par = new MarkdownParser(); + assertEquals(ren.render(par.parse("# hello")), "

hello

"); + assertEquals(ren.render(par.parse("# hello#")), "

hello#

"); + }, +}); + +Deno.test({ + name: "should parse ATX header with specific level", + fn: () => { + const par = new MarkdownParser(); + assertEquals(ren.render(par.parse("# hello")), "

hello

"); + assertEquals(ren.render(par.parse("## hello")), "

hello

"); + assertEquals(ren.render(par.parse("### hello")), "

hello

"); + assertEquals(ren.render(par.parse("#### hello")), "

hello

"); + assertEquals(ren.render(par.parse("##### hello")), "
hello
"); + assertEquals(ren.render(par.parse("###### hello")), "
hello
"); + }, +}); + +Deno.test({ + name: "should parse ATX header if line contains additional spaces", + fn: () => { + const par = new MarkdownParser(); + assertEquals(ren.render(par.parse(" # hello")), "

hello

"); + assertEquals(ren.render(par.parse(" # hello")), "

hello

"); + assertEquals(ren.render(par.parse(" # hello")), "

hello

"); + }, +}); + +Deno.test({ + name: "should parse ATX header with closing sequence", + fn: () => { + const par = new MarkdownParser(); + assertEquals(ren.render(par.parse("# #")), "

"); + assertEquals(ren.render(par.parse("# hello #")), "

hello

"); + assertEquals(ren.render(par.parse("# hello #########")), "

hello

"); + assertEquals(ren.render(par.parse("# hello # ")), "

hello

"); + assertEquals(ren.render(par.parse("###### hello #")), "
hello
"); + }, +}); + +Deno.test({ + name: "should parse many headers with text", + fn: () => { + const par = new MarkdownParser(); + + const input = `\ +# hello +## world +### this is +#### my world!`; + + assertEquals( + ren.render(par.parse(input)), + "

hello

world

this is

my world!

", + ); + }, +}); + +// Paragraph + +Deno.test({ + name: "should parse paragraph", + fn: () => { + const par = new MarkdownParser(); + + assertEquals(ren.render(par.parse("hello")), "

hello

"); + }, +}); + +Deno.test({ + name: "should parse paragraph with softbreak", + fn: () => { + const par = new MarkdownParser(); + + const input = `\ +hello +world`; + + assertEquals(ren.render(par.parse(input)), "

hello world

"); + }, +}); + +// Link + +Deno.test({ + name: "should parse link", + fn: () => { + const par = new MarkdownParser(); + + assertEquals( + ren.render(par.parse("[]()")), + '

', + ); + assertEquals( + ren.render(par.parse("[hello]()")), + '

hello

', + ); + assertEquals( + ren.render(par.parse("[hello]()")), + '

hello

', + ); + }, +}); + +Deno.test({ + name: "should parse link destination", + fn: () => { + const par = new MarkdownParser(); + + assertEquals( + ren.render(par.parse("[](/hello)")), + '

', + ); + assertEquals( + ren.render(par.parse("[](/hello?key=value&key2=value2)")), + '

', + ); + assertEquals( + ren.render(par.parse("[hello](https://example.com)")), + '

hello

', + ); + assertEquals( + ren.render(par.parse("[hello](mailto:john@example.com)")), + '

hello

', + ); + assertEquals( + ren.render(par.parse("[](/привет)")), + '

', + ); + assertEquals( + ren.render(par.parse("[]()")), + '

', + ); + assertEquals( + ren.render(par.parse("[]()")), + '

', + ); + }, +}); + +Deno.test({ + name: "should parse link title", + fn: () => { + const par = new MarkdownParser(); + + assertEquals( + ren.render(par.parse("[](/hello 'hello')")), + '

', + ); + assertEquals( + ren.render(par.parse('[hello](/hello "world")')), + '

hello

', + ); + assertEquals( + ren.render(par.parse('[hello]( "hello world")')), + '

hello

', + ); + }, +}); + +// List + +Deno.test({ + name: "should parse list with empty items", + fn: () => { + const par = new MarkdownParser(); + + assertEquals( + ren.render(par.parse("-")), + "
", + ); + assertEquals( + ren.render(par.parse("- ")), + "
", + ); + }, +}); + +Deno.test({ + name: "should parse list if line contains additional spaces", + fn: () => { + const expected = "
  • hello
"; + const par = new MarkdownParser(); + assertEquals(ren.render(par.parse(" - hello")), expected); + assertEquals(ren.render(par.parse(" - hello")), expected); + assertEquals(ren.render(par.parse(" - hello")), expected); + }, +}); + +Deno.test({ + name: "should not display a single paragraph in the list", + fn: () => { + const par = new MarkdownParser(); + + assertEquals( + ren.render(par.parse("- hello")), + "
  • hello
", + ); + assertEquals( + ren.render(par.parse(`\ +- hello +world`)), + "
  • hello world
", + ); + assertEquals( + ren.render(par.parse(`\ +- hello + world`)), + "
  • hello world
", + ); + }, +}); + +Deno.test({ + name: "should parse many items in the list", + fn: () => { + const par = new MarkdownParser(); + + assertEquals( + ren.render(par.parse(`\ +- hello +- world`)), + "
  • hello
  • world
/**
 * Minimal Markdown parser (`par/md.ts`).
 *
 * Supported constructs: ATX headings, paragraphs (adjacent lines are joined as
 * soft breaks), inline links and flat bullet lists. The input is first parsed
 * into a small AST (see the `Ast*` types at the bottom of the file) and the
 * AST is then converted into renderer nodes (`Elem`, `TextNode`, `Fragment`).
 *
 * Every `parse*` / `skip*` helper follows the same contract: it receives the
 * not-yet-consumed input and returns the remaining input after consuming its
 * construct, or `null` when the construct does not start at this position.
 */
import { AnyNode, Elem, Fragment, TextNode } from "../core/node.ts";
import { isNil, Nilable } from "../core/utils.ts";
import { Parser } from "./types.ts";

// A leading blank line: optional whitespace (anything but "\n", so a "\r" of a
// CRLF pair is covered) followed by a line break or by the end of the input.
// The expression is deliberately line-based; anchoring it to the whole input
// would prevent blank lines in the middle of a document from ending a
// paragraph.
const RE_EMPTY_LINE = /^[^\S\n]*(?:\n|$)/;

// Up to three characters of indentation, 1-6 "#", then whitespace or the end
// of the input. Group 1 is the marker, group 2 the terminator.
const RE_OPEN_ATX_HEADING = /^\s{0,3}(#{1,6})(\s|$)/;
// Optional closing sequence of "#" preceded by whitespace (or standing alone),
// followed by trailing whitespace. Must be applied to a single line.
const RE_CLOSE_ATX_HEADING = /(^|\s+)#*\s*$/;

// Up to three characters of indentation, a bullet char, then whitespace or the
// end of the input. Group 1 is the bullet char.
const RE_LIST_ITEM = /^\s{0,3}([-+*])(\s|$)/;

// Groups: 1 = link text, 2 = bare destination, 3 = <destination>,
// 4 = title quote char, 5 = title.
// TODO: make better regex for destination
const RE_LINK = /\[([\s\S]*?)]\((?:([^\s]*)|<(.+?)>)(?: ('|")(.+?)\4)?\)/;

/** Parses Markdown text into a `Fragment` of renderer nodes. */
export class MarkdownParser implements Parser {
  /**
   * @param input Markdown source text.
   * @returns A fragment holding one element per top-level block
   *   (heading, paragraph or list) in document order.
   */
  parse(input: string): AnyNode {
    const astDoc: AstDocument = { kind: AstKind.Document, content: [] };

    let readStr = input;
    while (readStr.length) {
      // Order matters: the paragraph parser is the catch-all fallback.
      const newReadStr = skipEmptyLine(readStr) ??
        parseAtxHeading(astDoc, readStr) ??
        parseList(astDoc, readStr) ??
        parseParagraph(astDoc, readStr);
      if (isNil(newReadStr)) break;
      readStr = newReadStr;
    }

    return new Fragment(astDoc.content.map(DocChild));
  }
}

// AST -> renderer nodes

/** Converts a list AST node; only bullet lists exist so far. */
function List(ast: AstList): Elem {
  // switch (ast.kind)
  return BulletList(ast);
}

/** Converts a bullet list into a `ul` element. */
function BulletList(ast: AstBulletList): Elem {
  return new Elem("ul", {}, ast.content.map(ListItem));
}

/**
 * Converts a list item into an `li` element. An item holding exactly one
 * paragraph is rendered "tight": the paragraph's inline content becomes the
 * direct content of the `li`, without a wrapping `p`.
 */
function ListItem(ast: AstListItem): Elem {
  return new Elem(
    "li",
    {},
    ast.content.length === 1 && ast.content[0].kind === AstKind.Paragraph
      ? ast.content[0].content.map(InlineContent)
      : ast.content.map(DocChild),
  );
}

/** Dispatches a block-level AST node to its converter. */
function DocChild(ast: AstDocumentChild): Elem {
  switch (ast.kind) {
    case AstKind.AtxHeading:
      return Heading(ast);
    case AstKind.Paragraph:
      return Paragraph(ast);
    case AstKind.List:
      return List(ast);
  }
}

/** Converts a heading into `h1`..`h6` according to its level. */
function Heading(ast: AstAtxHeading): Elem {
  return new Elem(`h${ast.level}`, {}, ast.content.map(InlineContent));
}

/** Converts a paragraph into a `p` element. */
function Paragraph(ast: AstParagraph): Elem {
  return new Elem("p", {}, ast.content.map(InlineContent));
}

/** Converts an inline AST node (link or plain text). */
function InlineContent(ast: AstInlineContent): AnyNode {
  return ast.kind === AstKind.Link ? Link(ast) : Text(ast);
}

/** Converts a link into an `a` element. */
function Link(ast: AstLink): Elem {
  // `||` on purpose: an empty destination falls back to "#" as well.
  const attrs: Record<string, string> = { href: ast.destination || "#" };
  if (ast.title) attrs.title = ast.title;

  return new Elem("a", attrs, ast.content.map(Text));
}

/** Converts a text AST node into a `TextNode`. */
function Text(ast: AstText): TextNode {
  return new TextNode(ast.content);
}

// parse utils

/**
 * Returns the first line of `readStr` including its terminating "\n", or the
 * whole string when it contains no line break.
 */
function takeLine(readStr: string): string {
  const eol = readStr.indexOf("\n");
  return eol === -1 ? readStr : readStr.slice(0, eol + 1);
}

/** Consumes one leading blank line, if there is one. */
function skipEmptyLine(readStr: string): string | null {
  const match = RE_EMPTY_LINE.exec(readStr);
  if (isNil(match)) return null;
  return readStr.slice(match[0].length);
}

/**
 * Consumes one ATX heading line (`# text`, optionally closed by `#`s) and
 * appends it to `ast`.
 */
function parseAtxHeading(ast: AstDocument, readStr: string): string | null {
  const match = RE_OPEN_ATX_HEADING.exec(readStr);
  if (isNil(match)) return null;

  readStr = readStr.slice(match[0].length);

  const atxHeading: AstAtxHeading = {
    kind: AstKind.AtxHeading,
    level: match[1].length as HeadingLevel,
    content: [],
  };
  ast.content.push(atxHeading);

  // The marker was terminated by the end of the input or by a line break:
  // the heading is empty and the next line must not be taken as its text.
  if (match[2] === "" || match[2] === "\n") return readStr;

  // Only the heading's own line may be inspected. Running the closing regex
  // (anchored with `$`) against the whole remaining input would make the
  // heading swallow every following line whenever the input ends with
  // whitespace.
  const line = takeLine(readStr);
  const endMatch = RE_CLOSE_ATX_HEADING.exec(line);
  const headingInlineContent = isNil(endMatch)
    ? line
    : line.slice(0, endMatch.index);

  parseInlineContent(atxHeading, headingInlineContent);

  return readStr.slice(line.length);
}

/**
 * Consumes a run of consecutive bullet list items and appends a single list
 * to `ast`. The bullet char of the first item is recorded for the list.
 */
function parseList(ast: AstDocument, readStr: string): string | null {
  if (!readStr.length) return null;

  let listMatch = RE_LIST_ITEM.exec(readStr);
  if (isNil(listMatch)) return null;

  const astList: AstBulletList = {
    kind: AstKind.List,
    type: AstListType.Bullet,
    bulletChar: listMatch[1] as ListBulletChar,
    content: [],
  };
  ast.content.push(astList);

  do {
    const astListItem: AstListItem = {
      kind: AstKind.ListItem,
      content: [],
    };
    astList.content.push(astListItem);

    // Drop the item marker; what follows is the item's content.
    readStr = readStr.slice(listMatch[0].length);

    const newReadStr = // parseAtxHeading(astList, readStr) ??
      // parseList(astList, readStr) ??
      parseParagraph(astListItem, readStr);
    // `null` means the input ended right after the marker (empty item).
    if (isNil(newReadStr)) break;
    readStr = newReadStr;

    listMatch = RE_LIST_ITEM.exec(readStr);
  } while (!isNil(listMatch));

  return readStr;
}

/**
 * Consumes consecutive lines up to (not including) the next blank line or
 * list item marker and appends them to `ast` as one paragraph.
 */
function parseParagraph(
  ast: AstDocument | AstListItem,
  readStr: string,
): string | null {
  if (!readStr.length) return null;

  const paragraph: AstParagraph = {
    kind: AstKind.Paragraph,
    content: [],
  };
  ast.content.push(paragraph);

  let paragraphInlineContent = "";
  while (!RE_EMPTY_LINE.test(readStr)) {
    if (RE_LIST_ITEM.test(readStr)) break;
    const line = takeLine(readStr);
    paragraphInlineContent += line;
    // Advance by the length of the line just taken. Advancing by the length
    // of the accumulated paragraph would skip unread input from the second
    // line on.
    readStr = readStr.slice(line.length);
  }

  if (paragraphInlineContent.length) {
    parseInlineContent(paragraph, paragraphInlineContent);
  }

  return readStr;
}

/**
 * Splits `readStr` into text and link nodes and appends them to `ast`.
 * Processes the first link and recurses on the text after it.
 */
function parseInlineContent(
  ast: AstAtxHeading | AstParagraph,
  readStr: string,
): string | null {
  if (!readStr.length) return null;

  const linkMatch = RE_LINK.exec(readStr);
  if (isNil(linkMatch)) return parseText(ast, readStr);

  const astLink: AstLink = {
    kind: AstKind.Link,
    // Prefer the <...> form; otherwise the bare destination.
    destination: encodeURI(linkMatch[3] ?? linkMatch[2]),
    title: linkMatch[5],
    content: [],
  };

  // 1. parse before link
  parseText(ast, readStr.slice(0, linkMatch.index));

  // 2. create link and parse inner content for link
  ast.content.push(astLink);
  parseText(astLink, linkMatch[1]);

  // 3. parse rest text
  return parseInlineContent(
    ast,
    readStr.slice(linkMatch.index + linkMatch[0].length),
  );
}

/**
 * Appends one text node per non-empty line of `readStr` to `ast`. Leading
 * whitespace of every line is dropped.
 */
function parseText(
  ast: AstAtxHeading | AstParagraph | AstLink,
  readStr: string,
): string | null {
  if (!readStr.length) return null;

  // Split on LF and CRLF alike so that no stray "\r" ends up in the text.
  const parts = readStr.split(/\r?\n/).filter(Boolean).map(
    (textPart): AstText => ({
      kind: AstKind.Text,
      content: textPart.trimStart(),
    }),
  );

  ast.content.push(...parts);

  return "";
}

// AST

type AstDocument = BaseAstItem<AstKind.Document, AstDocumentChild[]>;
type AstDocumentChild = AstAtxHeading | AstBulletList | AstParagraph | AstList;

type AstList = AstBulletList; // | AstOrderedList

enum AstListType {
  Bullet,
  // Ordered,
}

type ListBulletChar = "-" | "+" | "*";

type AstListItem = BaseAstItem<AstKind.ListItem, AstListItemChild[]>;

type AstListItemChild = AstDocumentChild;

interface AstAtxHeading
  extends BaseAstItem<AstKind.AtxHeading, AstInlineContent[]> {
  level: HeadingLevel;
}

type HeadingLevel = 1 | 2 | 3 | 4 | 5 | 6;

interface AstBulletList extends BaseAstItem<AstKind.List, AstListItem[]> {
  type: AstListType.Bullet;
  bulletChar: ListBulletChar;
}

type AstParagraph = BaseAstItem<AstKind.Paragraph, AstInlineContent[]>;

type AstInlineContent = AstText | AstLink;

interface AstLink extends BaseAstItem<AstKind.Link, AstText[]> {
  destination: string;
  title: Nilable<string>;
}

type AstText = BaseAstItem<AstKind.Text, string>;

/** Common shape of every AST node: a discriminating kind plus its content. */
interface BaseAstItem<K extends AstKind, Cont> {
  kind: K;
  content: Cont;
}

enum AstKind {
  Document,
  AtxHeading,
  Paragraph,
  List,
  ListItem,
  Link,
  Text,
}
a/ren/html_str.test.ts +++ b/ren/html_str.test.ts @@ -19,24 +19,34 @@ Deno.test({ Deno.test({ name: "should render element", fn: () => { - const el = E("p", [], "hello world"); - const ren = new HtmlStrRenderer(); - const res = ren.render(el); - assertEquals(res, "

hello world

"); + assertEquals(ren.render(E("p", [])), "

"); + assertEquals(ren.render(E("p", [], "hello world")), "

hello world

"); + assertEquals( + ren.render(E("p", [], ["hello", "world"])), + "

hello world

", + ); + assertEquals( + ren.render(E("p", [], [E("span", [], "hello"), E("span", [], "world")])), + "

helloworld

", + ); + assertEquals( + ren.render(E("p", [], ["hello", E("span", [], "world")])), + "

hello world

", + ); + assertEquals( + ren.render(E("p", [], [E("span", [], "hello"), "world"])), + "

hello world

", + ); }, }); Deno.test({ name: "should render empty fragment as empty string", fn: () => { - const frag = F([]); - const ren = new HtmlStrRenderer(); - const res = ren.render(frag); - - assertEquals(res, ""); + assertEquals(ren.render(F([])), ""); }, }); @@ -52,7 +62,7 @@ Deno.test({ const ren = new HtmlStrRenderer(); const res = ren.render(frag); - assertEquals(res, 'hello world

world

'); + assertEquals(res, 'hello world

world

'); }, }); diff --git a/ren/html_str.ts b/ren/html_str.ts index 5aa279c..acb261a 100644 --- a/ren/html_str.ts +++ b/ren/html_str.ts @@ -79,7 +79,9 @@ function encodeHtmlFragment( node: Fragment, hooks: HtmlStrRendererHooks, ): string { - return concat(node.children.map((ch) => encodeAnyNode(ch, hooks))); + return concatEncodedNodes( + node.children.map((ch) => encodeAnyNode(ch, hooks)), + ); } function encodeHtmlElement( @@ -90,7 +92,11 @@ function encodeHtmlElement( if (isSelfClosedTagName(tagName)) return open; const encodedChildren = children.map((ch) => encodeAnyNode(ch, hooks)); - return `${open}${concat(encodedChildren)}`; + return `${open}${concatEncodedNodes(encodedChildren)}`; +} + +function concatEncodedNodes(encodedChildren: string[]): string { + return join(" ", encodedChildren).replace(/>\s+?<"); } function encodeAttrs(