Parser for markdown documents (#6)

Closes #1

This MR implements only the basic functionality:

- [x] ATX Heading
- [x] Paragraph
- [x] Text Node
- [x] Softbreak
- [x] Links
- [x] List

Reviewed-on: #6
Co-authored-by: Dmitriy Pleshevskiy <dmitriy@ideascup.me>
Co-committed-by: Dmitriy Pleshevskiy <dmitriy@ideascup.me>
This commit is contained in:
Dmitriy Pleshevskiy 2022-06-13 14:58:22 +00:00 committed by Gitea
parent 6c175effb4
commit e8c6ce97c6
No known key found for this signature in database
GPG Key ID: 55B75599806CD426
6 changed files with 591 additions and 12 deletions

6
.gitignore vendored
View File

@ -9,5 +9,11 @@
!/import_map.json
!/core
/core/*
!/ren
/ren/*
!/par
/par/*
!/**/*.ts

260
par/md.test.ts Normal file
View File

@ -0,0 +1,260 @@
import { assertEquals } from "testing/asserts.ts";
import { HtmlStrRenderer } from "../ren/html_str.ts";
import { MarkdownParser } from "./md.ts";
// Shared renderer: every test below parses markdown, renders the result with
// this renderer and compares the produced HTML string.
const ren = new HtmlStrRenderer();
// Misc
// Input made only of line breaks / whitespace must render to an empty string.
Deno.test({
name: "should skip empty line",
fn: () => {
const par = new MarkdownParser();
assertEquals(ren.render(par.parse("\n")), "");
assertEquals(ren.render(par.parse("\r\n")), "");
assertEquals(ren.render(par.parse("\n\r\n")), "");
assertEquals(ren.render(par.parse("\n \n")), "");
},
});
// ATX Header
// A lone "#" is a level-1 heading without content.
Deno.test({
name: "should parse empty ATX header",
fn: () => {
const par = new MarkdownParser();
const res = par.parse("#");
assertEquals(ren.render(res), "<h1></h1>");
},
});
// A "#" glued to the end of the text is part of the text, not a closing sequence.
Deno.test({
name: "should parse ATX header with text",
fn: () => {
const par = new MarkdownParser();
assertEquals(ren.render(par.parse("# hello")), "<h1>hello</h1>");
assertEquals(ren.render(par.parse("# hello#")), "<h1>hello#</h1>");
},
});
// The number of leading "#" characters (1-6) selects the heading level.
Deno.test({
name: "should parse ATX header with specific level",
fn: () => {
const par = new MarkdownParser();
assertEquals(ren.render(par.parse("# hello")), "<h1>hello</h1>");
assertEquals(ren.render(par.parse("## hello")), "<h2>hello</h2>");
assertEquals(ren.render(par.parse("### hello")), "<h3>hello</h3>");
assertEquals(ren.render(par.parse("#### hello")), "<h4>hello</h4>");
assertEquals(ren.render(par.parse("##### hello")), "<h5>hello</h5>");
assertEquals(ren.render(par.parse("###### hello")), "<h6>hello</h6>");
},
});
// NOTE(review): the three inputs below read identically in this view; presumably
// they differ in the amount of leading/inner spacing — confirm against the file.
Deno.test({
name: "should parse ATX header if line contains additional spaces",
fn: () => {
const par = new MarkdownParser();
assertEquals(ren.render(par.parse(" # hello")), "<h1>hello</h1>");
assertEquals(ren.render(par.parse(" # hello")), "<h1>hello</h1>");
assertEquals(ren.render(par.parse(" # hello")), "<h1>hello</h1>");
},
});
// A trailing run of "#" preceded by whitespace (and optionally followed by
// whitespace) closes the heading and is not rendered.
Deno.test({
name: "should parse ATX header with closing sequence",
fn: () => {
const par = new MarkdownParser();
assertEquals(ren.render(par.parse("# #")), "<h1></h1>");
assertEquals(ren.render(par.parse("# hello #")), "<h1>hello</h1>");
assertEquals(ren.render(par.parse("# hello #########")), "<h1>hello</h1>");
assertEquals(ren.render(par.parse("# hello # ")), "<h1>hello</h1>");
assertEquals(ren.render(par.parse("###### hello #")), "<h6>hello</h6>");
},
});
// Consecutive heading lines produce one heading element each.
Deno.test({
name: "should parse many headers with text",
fn: () => {
const par = new MarkdownParser();
const input = `\
# hello
## world
### this is
#### my world!`;
assertEquals(
ren.render(par.parse(input)),
"<h1>hello</h1><h2>world</h2><h3>this is</h3><h4>my world!</h4>",
);
},
});
// Paragraph
// Plain text becomes a single <p> element.
Deno.test({
name: "should parse paragraph",
fn: () => {
const par = new MarkdownParser();
assertEquals(ren.render(par.parse("hello")), "<p>hello</p>");
},
});
// A line break inside a paragraph (softbreak) is rendered as a single space.
Deno.test({
name: "should parse paragraph with softbreak",
fn: () => {
const par = new MarkdownParser();
const input = `\
hello
world`;
assertEquals(ren.render(par.parse(input)), "<p>hello world</p>");
},
});
// Link
// A link with an empty destination is rendered with href="#"; the link text
// may be empty as well. (The original repeated the "[hello]()" assertion
// verbatim; the redundant copy has been dropped.)
Deno.test({
  name: "should parse link",
  fn: () => {
    const par = new MarkdownParser();
    assertEquals(
      ren.render(par.parse("[]()")),
      '<p><a href="#"></a></p>',
    );
    assertEquals(
      ren.render(par.parse("[hello]()")),
      '<p><a href="#">hello</a></p>',
    );
  },
});
// Destinations are copied into href; non-ASCII characters and spaces (only
// possible in the <...> form) are expected to come out percent-encoded.
Deno.test({
name: "should parse link destination",
fn: () => {
const par = new MarkdownParser();
assertEquals(
ren.render(par.parse("[](/hello)")),
'<p><a href="/hello"></a></p>',
);
assertEquals(
ren.render(par.parse("[](/hello?key=value&key2=value2)")),
'<p><a href="/hello?key=value&key2=value2"></a></p>',
);
assertEquals(
ren.render(par.parse("[hello](https://example.com)")),
'<p><a href="https://example.com">hello</a></p>',
);
assertEquals(
ren.render(par.parse("[hello](mailto:john@example.com)")),
'<p><a href="mailto:john@example.com">hello</a></p>',
);
assertEquals(
ren.render(par.parse("[](/привет)")),
'<p><a href="/%D0%BF%D1%80%D0%B8%D0%B2%D0%B5%D1%82"></a></p>',
);
assertEquals(
ren.render(par.parse("[](</hello world>)")),
'<p><a href="/hello%20world"></a></p>',
);
assertEquals(
ren.render(par.parse("[](</hello world?key=value value2&key2=value3>)")),
'<p><a href="/hello%20world?key=value%20value2&key2=value3"></a></p>',
);
},
});
// A quoted string after the destination (single or double quotes) becomes the
// title attribute.
Deno.test({
name: "should parse link title",
fn: () => {
const par = new MarkdownParser();
assertEquals(
ren.render(par.parse("[](/hello 'hello')")),
'<p><a href="/hello" title="hello"></a></p>',
);
assertEquals(
ren.render(par.parse('[hello](/hello "world")')),
'<p><a href="/hello" title="world">hello</a></p>',
);
assertEquals(
ren.render(par.parse('[hello](</hello world> "hello world")')),
'<p><a href="/hello%20world" title="hello world">hello</a></p>',
);
},
});
// List
// A bare bullet marker, with or without a trailing space, is a list with one
// empty item.
Deno.test({
name: "should parse list with empty items",
fn: () => {
const par = new MarkdownParser();
assertEquals(
ren.render(par.parse("-")),
"<ul><li></li></ul>",
);
assertEquals(
ren.render(par.parse("- ")),
"<ul><li></li></ul>",
);
},
});
// NOTE(review): the three inputs below read identically in this view; presumably
// they differ in the amount of leading/inner spacing — confirm against the file.
Deno.test({
name: "should parse list if line contains additional spaces",
fn: () => {
const expected = "<ul><li>hello</li></ul>";
const par = new MarkdownParser();
assertEquals(ren.render(par.parse(" - hello")), expected);
assertEquals(ren.render(par.parse(" - hello")), expected);
assertEquals(ren.render(par.parse(" - hello")), expected);
},
});
// An item holding exactly one paragraph is rendered without the <p> wrapper;
// continuation lines are folded into that paragraph.
// NOTE(review): the two multi-line inputs look the same here; presumably the
// second one indents the continuation line — confirm against the file.
Deno.test({
name: "should not display a single paragraph in the list",
fn: () => {
const par = new MarkdownParser();
assertEquals(
ren.render(par.parse("- hello")),
"<ul><li>hello</li></ul>",
);
assertEquals(
ren.render(par.parse(`\
- hello
world`)),
"<ul><li>hello world</li></ul>",
);
assertEquals(
ren.render(par.parse(`\
- hello
world`)),
"<ul><li>hello world</li></ul>",
);
},
});
// Consecutive bullet lines end up as items of the same list.
Deno.test({
name: "should parse many items in the list",
fn: () => {
const par = new MarkdownParser();
assertEquals(
ren.render(par.parse(`\
- hello
- world`)),
"<ul><li>hello</li><li>world</li></ul>",
);
},
});

292
par/md.ts Normal file
View File

@ -0,0 +1,292 @@
import { AnyNode, Elem, Fragment, TextNode } from "../core/node.ts";
import { isNil, Nilable } from "../core/utils.ts";
import { Parser } from "./types.ts";
/**
 * Matches ONE blank line at the start of the input: optional non-newline
 * whitespace followed by a line feed or the end of input (so "\r\n" is
 * covered too). The previous pattern `/^\s*$/` only matched when the whole
 * remaining input was blank, so a blank line between two blocks was never
 * recognised. Also matches the empty string.
 */
const RE_EMPTY_LINE = /^[^\S\n]*(?:\n|$)/;
/** Up to three leading whitespace chars, 1-6 `#` (group 1), then whitespace or end of input (group 2). */
const RE_OPEN_ATX_HEADING = /^\s{0,3}(#{1,6})(\s|$)/;
/** Optional closing run of `#` at the end of the text; it must start the text or follow whitespace. */
const RE_CLOSE_ATX_HEADING = /(^|\s+)#*\s*$/;
/** Up to three leading whitespace chars, a bullet char (group 1), then whitespace or end of input (group 2). */
const RE_LIST_ITEM = /^\s{0,3}([-+*])(\s|$)/;
// TODO: make better regex for destination
/**
 * Inline link `[text](destination "title")`.
 *
 * Groups: 1 = link text, 2 = destination written as `<...>` (brackets
 * stripped, may contain spaces), 3 = bare destination without whitespace,
 * 4 = title quote char, 5 = title.
 *
 * The `<...>` alternative is tried first. With the bare alternative first,
 * `[a](<b>)` was captured as the bare destination `<b>`, brackets included.
 * Exactly one of groups 2 and 3 is defined for any match, so consumers should
 * read the destination as "whichever of the two is defined".
 */
const RE_LINK = /\[([\s\S]*?)]\((?:<([^<>\n]*)>|(\S*))(?: ('|")(.+?)\4)?\)/;
/**
 * Markdown parser: turns the input into an intermediate AST block by block
 * and converts that AST into a `Fragment` of nodes.
 */
export class MarkdownParser implements Parser {
  /**
   * Parses `input` and returns a fragment holding one node per top-level
   * block (heading, list, paragraph).
   */
  parse(input: string): AnyNode {
    const document: AstDocument = { kind: AstKind.Document, content: [] };

    // Block parsers in priority order. Each one returns the unread remainder
    // when it recognised the start of the input, or null when it did not.
    const blockParsers: Array<(rest: string) => string | null> = [
      skipEmptyLine,
      (rest) => parseAtxHeading(document, rest),
      (rest) => parseList(document, rest),
      (rest) => parseParagraph(document, rest),
    ];

    let remaining = input;
    while (remaining.length > 0) {
      let advanced: string | null = null;
      for (const tryParse of blockParsers) {
        advanced = tryParse(remaining);
        if (!isNil(advanced)) break;
      }
      // No parser recognised the input: stop instead of spinning forever.
      if (isNil(advanced)) break;
      remaining = advanced;
    }

    const children = document.content.map(DocChild);
    return new Fragment(children);
  }
}
/**
 * Renders a list AST node. Bullet lists are the only list type so far, so
 * the node is handed straight to `BulletList`.
 */
function List(ast: AstList): Elem {
  // switch (ast.kind) once more list types exist
  const bulletList: AstBulletList = ast;
  return BulletList(bulletList);
}
/** Renders a bullet list as a `ul` element with one child per list item. */
function BulletList(ast: AstBulletList): Elem {
  const items = ast.content.map(ListItem);
  return new Elem("ul", {}, items);
}
/**
 * Renders a list item as an `li` element.
 *
 * An item whose only child is a paragraph is rendered "tight": the
 * paragraph's inline content becomes the direct content of the `li`, without
 * a wrapping `p`. Any other item renders each child as a block.
 */
function ListItem(ast: AstListItem): Elem {
  const [first] = ast.content;
  if (ast.content.length === 1 && first.kind === AstKind.Paragraph) {
    return new Elem("li", {}, first.content.map(InlineContent));
  }
  return new Elem("li", {}, ast.content.map(DocChild));
}
/** Renders a block-level AST node by dispatching on its kind. */
function DocChild(ast: AstDocumentChild): Elem {
  if (ast.kind === AstKind.AtxHeading) return Heading(ast);
  if (ast.kind === AstKind.Paragraph) return Paragraph(ast);
  // The only remaining member of the union is the list.
  return List(ast);
}
/** Renders an ATX heading as an `h1`..`h6` element, picked by the heading level. */
function Heading(ast: AstAtxHeading): Elem {
  const tagName = `h${ast.level}`;
  const children = ast.content.map(InlineContent);
  return new Elem(tagName, {}, children);
}
/** Renders a paragraph as a `p` element wrapping its inline content. */
function Paragraph(ast: AstParagraph): Elem {
  const children = ast.content.map(InlineContent);
  return new Elem("p", {}, children);
}
/** Renders an inline AST node: links become elements, everything else is text. */
function InlineContent(ast: AstInlineContent): AnyNode {
  if (ast.kind === AstKind.Link) {
    return Link(ast);
  }
  return Text(ast);
}
/**
 * Renders a link as an `a` element. An empty destination falls back to "#";
 * the `title` attribute is emitted only when the title is non-empty.
 */
function Link(ast: AstLink): Elem {
  const href = ast.destination || "#";
  const attrs: Record<string, string> = ast.title
    ? { href, title: ast.title }
    : { href };
  const children = ast.content.map(Text);
  return new Elem("a", attrs, children);
}
/** Renders a text AST node as a `TextNode` carrying the same string. */
function Text(ast: AstText): TextNode {
  const { content } = ast;
  return new TextNode(content);
}
// parse utils
/**
 * Drops the part of `readStr` matched by `RE_EMPTY_LINE`.
 *
 * @returns the input without the matched prefix, or null when the pattern
 * does not match.
 */
function skipEmptyLine(readStr: string): string | null {
  const blank = RE_EMPTY_LINE.exec(readStr);
  return isNil(blank) ? null : readStr.slice(blank[0].length);
}
/**
 * Parses an ATX heading (`# text`, optionally closed by a trailing run of
 * `#`) at the start of `readStr` and appends it to `ast`.
 *
 * The heading text is limited to the heading's own line. Previously the
 * closing-sequence pattern was run against the whole remaining input, so any
 * input ending in whitespace or `#` (e.g. "# a\ntext\n") made the heading
 * swallow all following lines, and "#\nfoo" turned `foo` into heading text.
 *
 * @param ast document that receives the heading
 * @param readStr unread input
 * @returns the input left after the heading line, or null when `readStr`
 * does not start with an ATX heading
 */
function parseAtxHeading(ast: AstDocument, readStr: string): string | null {
  const match = RE_OPEN_ATX_HEADING.exec(readStr);
  if (isNil(match)) return null;

  const atxHeading: AstAtxHeading = {
    kind: AstKind.AtxHeading,
    level: match[1].length as HeadingLevel,
    content: [],
  };
  ast.content.push(atxHeading);

  const rest = readStr.slice(match[0].length);
  // The opening sequence was directly followed by the end of the input or by
  // the line break: the heading is empty and the next line is not part of it.
  if (match[2].length === 0 || match[2] === "\n") return rest;

  // Only the current line (including its line break) belongs to the heading.
  const newlineIndex = rest.indexOf("\n");
  const line = newlineIndex === -1 ? rest : rest.slice(0, newlineIndex + 1);

  // Cut the optional closing sequence and trailing whitespace off the line.
  const endMatch = RE_CLOSE_ATX_HEADING.exec(line);
  const headingInlineContent = isNil(endMatch)
    ? line
    : line.slice(0, endMatch.index);
  parseInlineContent(atxHeading, headingInlineContent);

  return rest.slice(line.length);
}
/**
 * Parses a bullet list at the start of `readStr` and appends it to `ast`.
 *
 * Every bullet marker opens a new item whose content is read as a single
 * paragraph; the list continues for as long as the input left after an item
 * starts with another marker.
 *
 * @param ast document that receives the list
 * @param readStr unread input
 * @returns the input left after the list, or null when `readStr` is empty or
 * does not start with a list item
 */
function parseList(ast: AstDocument, readStr: string): string | null {
  if (readStr.length === 0) return null;

  const firstMarker = RE_LIST_ITEM.exec(readStr);
  if (isNil(firstMarker)) return null;

  const bulletList: AstBulletList = {
    kind: AstKind.List,
    type: AstListType.Bullet,
    bulletChar: firstMarker[1] as ListBulletChar,
    content: [],
  };
  ast.content.push(bulletList);

  let rest = readStr;
  for (
    let marker: RegExpExecArray | null = firstMarker;
    !isNil(marker);
    marker = RE_LIST_ITEM.exec(rest)
  ) {
    const item: AstListItem = { kind: AstKind.ListItem, content: [] };
    bulletList.content.push(item);

    // Drop the marker, then read the item body.
    rest = rest.slice(marker[0].length);
    // parseAtxHeading(...) ?? parseList(...) could be tried here for nested blocks
    const afterParagraph = parseParagraph(item, rest);
    if (isNil(afterParagraph)) break;
    rest = afterParagraph;
  }

  return rest;
}
/**
 * Parses a paragraph at the start of `readStr` and appends it to `ast`.
 *
 * Lines are collected until a blank line, a list item marker or the end of
 * the input. Blank lines in front of the first text line are skipped.
 *
 * Two defects of the previous version are fixed:
 * - the input was advanced by the length of the text collected so far
 *   instead of the length of the current line, so every line after the
 *   second one was silently dropped ("a\nb\nc" lost "c");
 * - the blank-line check looked at the whole remaining input, so a blank
 *   line never ended a paragraph.
 *
 * @param ast document or list item that receives the paragraph
 * @param readStr unread input
 * @returns the input left after the paragraph, or null when `readStr` is
 * empty
 */
function parseParagraph(
  ast: AstDocument | AstListItem,
  readStr: string,
): string | null {
  if (!readStr.length) return null;

  const paragraph: AstParagraph = {
    kind: AstKind.Paragraph,
    content: [],
  };
  ast.content.push(paragraph);

  let paragraphInlineContent = "";
  while (readStr.length) {
    const newlineIndex = readStr.indexOf("\n");
    const line = newlineIndex === -1
      ? readStr
      : readStr.slice(0, newlineIndex + 1);

    if (RE_EMPTY_LINE.test(line)) {
      // A blank line after some text ends the paragraph.
      if (paragraphInlineContent.length) break;
      // A leading blank line is consumed so that the caller always makes
      // progress.
      readStr = readStr.slice(line.length);
      continue;
    }
    // A list item marker interrupts the paragraph.
    if (RE_LIST_ITEM.test(readStr)) break;

    paragraphInlineContent += line;
    // Advance by the current line only.
    readStr = readStr.slice(line.length);
  }

  if (paragraphInlineContent.length) {
    parseInlineContent(paragraph, paragraphInlineContent);
  }
  return readStr;
}
/**
 * Percent-encodes a link destination for use as an `href` value.
 *
 * `encodeURI` also encodes "%", which would turn an escape the author already
 * wrote ("%20") into "%2520"; such escapes are restored afterwards.
 * `encodeURI` throws a `URIError` on lone surrogates — in that case the
 * destination is returned unchanged rather than aborting the whole parse.
 */
function encodeLinkDestination(destination: string): string {
  let encoded: string;
  try {
    encoded = encodeURI(destination);
  } catch {
    return destination;
  }
  return encoded.replace(/%25([0-9A-Fa-f]{2})/g, "%$1");
}

/**
 * Splits `readStr` into text and link nodes and appends them to `ast`.
 *
 * Links are located one after another with `RE_LINK`: the text in front of a
 * link, the link itself (with its own text content) and finally the text
 * after the last link are appended in source order. Implemented as a loop
 * rather than recursion, so the number of links does not affect stack depth.
 *
 * @param ast heading or paragraph that receives the inline nodes
 * @param readStr inline source text
 * @returns null when `readStr` is empty or nothing is left after the last
 * link, otherwise the result of `parseText` for the trailing text
 */
function parseInlineContent(
  ast: AstAtxHeading | AstParagraph,
  readStr: string,
): string | null {
  if (!readStr.length) return null;

  let rest = readStr;
  let linkMatch = RE_LINK.exec(rest);
  while (!isNil(linkMatch)) {
    // 1. text in front of the link
    parseText(ast, rest.slice(0, linkMatch.index));

    // 2. the link and its inner text; the destination is whichever of the
    //    two destination groups took part in the match
    const astLink: AstLink = {
      kind: AstKind.Link,
      destination: encodeLinkDestination(linkMatch[3] ?? linkMatch[2]),
      title: linkMatch[5],
      content: [],
    };
    ast.content.push(astLink);
    parseText(astLink, linkMatch[1]);

    // 3. continue behind the link
    rest = rest.slice(linkMatch.index + linkMatch[0].length);
    linkMatch = RE_LINK.exec(rest);
  }

  return parseText(ast, rest);
}
/**
 * Appends the text in `readStr` to `ast` as text nodes, one per non-empty
 * line, with leading whitespace removed from each line.
 *
 * @param ast node that receives the text nodes
 * @param readStr raw text, possibly spanning several lines
 * @returns null when `readStr` is empty, otherwise an empty string
 */
function parseText(
  ast: AstAtxHeading | AstParagraph | AstLink,
  readStr: string,
): string | null {
  if (readStr.length === 0) return null;

  for (const line of readStr.split("\n")) {
    // Empty segments come from consecutive or trailing line breaks.
    if (!line) continue;
    const textNode: AstText = {
      kind: AstKind.Text,
      content: line.trimStart(),
    };
    ast.content.push(textNode);
  }
  return "";
}
// AST
// Intermediate tree built by the parse* functions and converted into nodes by
// the render functions of this file.

/** Root of the tree: the ordered list of top-level blocks. */
type AstDocument = BaseAstItem<AstKind.Document, AstDocumentChild[]>;
/** Block-level items. NOTE(review): AstList is an alias of AstBulletList, so both union members are the same type today. */
type AstDocumentChild = AstAtxHeading | AstBulletList | AstParagraph | AstList;
/** Every list flavour; only bullet lists exist so far. */
type AstList = AstBulletList; // | AstOrderedList
/** Discriminates list flavours. */
enum AstListType {
Bullet,
// Ordered,
}
/** Characters accepted as a bullet marker. */
type ListBulletChar = "-" | "+" | "*";
/** One list item holding block-level children. */
type AstListItem = BaseAstItem<AstKind.ListItem, AstListItemChild[]>;
/** A list item may hold the same blocks as the document. */
type AstListItemChild = AstDocumentChild;
/** ATX heading: inline content plus the heading level. */
interface AstAtxHeading
extends BaseAstItem<AstKind.AtxHeading, AstInlineContent[]> {
level: HeadingLevel;
}
/** Heading levels, rendered as h1..h6. */
type HeadingLevel = 1 | 2 | 3 | 4 | 5 | 6;
/** Bullet list: its items plus the marker character captured from the first item. */
interface AstBulletList extends BaseAstItem<AstKind.List, AstListItem[]> {
type: AstListType.Bullet;
bulletChar: ListBulletChar;
}
/** Paragraph of inline content. */
type AstParagraph = BaseAstItem<AstKind.Paragraph, AstInlineContent[]>;
/** Inline items: plain text or a link. */
type AstInlineContent = AstText | AstLink;
/** Inline link; its content is limited to text nodes. */
interface AstLink extends BaseAstItem<AstKind.Link, AstText[]> {
// already percent-encoded by the parser; an empty value is rendered as "#"
destination: string;
// absent when the link has no title part
title: Nilable<string>;
}
/** Run of text; `content` is the string itself. */
type AstText = BaseAstItem<AstKind.Text, string>;
/** Shape shared by all AST items: a discriminating `kind` and the item's content. */
interface BaseAstItem<K extends AstKind, Cont> {
kind: K;
content: Cont;
}
/** Discriminator stored in `kind` of every AST item. */
enum AstKind {
Document,
AtxHeading,
Paragraph,
List,
ListItem,
Link,
Text,
}

5
par/types.ts Normal file
View File

@ -0,0 +1,5 @@
import { AnyNode } from "../core/node.ts";
/** Contract implemented by document parsers (e.g. the markdown parser). */
export interface Parser {
/** Parses the source text `input` and returns the resulting node. */
parse(input: string): AnyNode;
}

View File

@ -19,24 +19,34 @@ Deno.test({
Deno.test({
name: "should render element",
fn: () => {
const el = E("p", [], "hello world");
const ren = new HtmlStrRenderer();
const res = ren.render(el);
assertEquals(res, "<p>hello world</p>");
assertEquals(ren.render(E("p", [])), "<p></p>");
assertEquals(ren.render(E("p", [], "hello world")), "<p>hello world</p>");
assertEquals(
ren.render(E("p", [], ["hello", "world"])),
"<p>hello world</p>",
);
assertEquals(
ren.render(E("p", [], [E("span", [], "hello"), E("span", [], "world")])),
"<p><span>hello</span><span>world</span></p>",
);
assertEquals(
ren.render(E("p", [], ["hello", E("span", [], "world")])),
"<p>hello <span>world</span></p>",
);
assertEquals(
ren.render(E("p", [], [E("span", [], "hello"), "world"])),
"<p><span>hello</span> world</p>",
);
},
});
Deno.test({
name: "should render empty fragment as empty string",
fn: () => {
const frag = F([]);
const ren = new HtmlStrRenderer();
const res = ren.render(frag);
assertEquals(res, "");
assertEquals(ren.render(F([])), "");
},
});
@ -52,7 +62,7 @@ Deno.test({
const ren = new HtmlStrRenderer();
const res = ren.render(frag);
assertEquals(res, 'hello world<div class="hello"></div><p>world</p>');
assertEquals(res, 'hello world <div class="hello"></div><p>world</p>');
},
});

View File

@ -79,7 +79,9 @@ function encodeHtmlFragment(
node: Fragment,
hooks: HtmlStrRendererHooks,
): string {
return concat(node.children.map((ch) => encodeAnyNode(ch, hooks)));
return concatEncodedNodes(
node.children.map((ch) => encodeAnyNode(ch, hooks)),
);
}
function encodeHtmlElement(
@ -90,7 +92,11 @@ function encodeHtmlElement(
if (isSelfClosedTagName(tagName)) return open;
const encodedChildren = children.map((ch) => encodeAnyNode(ch, hooks));
return `${open}${concat(encodedChildren)}</${tagName}>`;
return `${open}${concatEncodedNodes(encodedChildren)}</${tagName}>`;
}
/**
 * Combines already-encoded child strings into one string.
 *
 * The children are passed to `join` with a single space as the first
 * argument (presumably the separator — `join` is defined outside this view),
 * then every whitespace run sitting directly between a `>` and a following
 * `<` is removed. Judging by the updated tests, the effect is that text stays
 * separated from neighbouring text or tags by one space while adjacent tags
 * touch each other.
 */
function concatEncodedNodes(encodedChildren: string[]): string {
// ">\s+?<" -> "><": drop the separator between a closing ">" and the next "<"
return join(" ", encodedChildren).replace(/>\s+?</g, "><");
}
function encodeAttrs(