|
1 | 1 | # go-tokenizer |
2 | 2 |
|
3 | | -General Tokenizer and Abstract Syntax Tree Generator |
| 3 | +A general-purpose tokenizer and Markdown parser with HTML rendering for Go. |
| 4 | + |
| 5 | +[![Go Reference](https://pkg.go.dev/badge/github.com/mutablelogic/go-tokenizer.svg)](https://pkg.go.dev/github.com/mutablelogic/go-tokenizer) |
| 6 | + |
| 7 | +## Features |
| 8 | + |
| 9 | +- **Lexical Scanner**: Tokenizes text into identifiers, numbers, strings, operators, and punctuation |
| 10 | +- **Markdown Parser**: Converts Markdown text into an Abstract Syntax Tree (AST) |
| 11 | +- **HTML Renderer**: Renders Markdown AST to HTML with proper escaping |
| 12 | +- **Configurable**: Optional features like comment parsing, newline handling, and float parsing |
| 13 | + |
| 14 | +## Installation |
| 15 | + |
| 16 | +```bash |
| 17 | +go get github.com/mutablelogic/go-tokenizer |
| 18 | +``` |
| 19 | + |
| 20 | +Requires Go 1.23 or later. |
| 21 | + |
| 22 | +## Quick Start |
| 23 | + |
| 24 | +### Tokenizing Text |
| 25 | + |
| 26 | +```go |
| 27 | +package main |
| 28 | + |
| 29 | +import ( |
| 30 | + "fmt" |
| 31 | + "strings" |
| 32 | + |
| 33 | + "github.com/mutablelogic/go-tokenizer" |
| 34 | +) |
| 35 | + |
| 36 | +func main() { |
| 37 | + scanner := tokenizer.NewScanner(strings.NewReader("hello world 123"), tokenizer.Pos{}) |
| 38 | + for { |
| 39 | + tok := scanner.Next() |
| 40 | + if tok.Kind == tokenizer.EOF { |
| 41 | + break |
| 42 | + } |
| 43 | + fmt.Printf("%s: %q\n", tok.Kind, tok.Value) |
| 44 | + } |
| 45 | +} |
| 46 | +``` |
| 47 | + |
| 48 | +Output: |
| 49 | + |
| 50 | +```text |
| 51 | +Ident: "hello" |
| 52 | +Space: " " |
| 53 | +Ident: "world" |
| 54 | +Space: " " |
| 55 | +NumberInteger: "123" |
| 56 | +``` |
| 57 | + |
| 58 | +### Parsing Markdown |
| 59 | + |
| 60 | +```go |
| 61 | +package main |
| 62 | + |
| 63 | +import ( |
| 64 | + "fmt" |
| 65 | + "strings" |
| 66 | + |
| 67 | + "github.com/mutablelogic/go-tokenizer" |
| 68 | + "github.com/mutablelogic/go-tokenizer/pkg/markdown" |
| 69 | + "github.com/mutablelogic/go-tokenizer/pkg/markdown/html" |
| 70 | +) |
| 71 | + |
| 72 | +func main() { |
| 73 | + input := `# Hello World |
| 74 | +
|
| 75 | +This is **bold** and _italic_ text. |
| 76 | +
|
| 77 | +- Item 1 |
| 78 | +- Item 2 |
| 79 | +- Item 3 |
| 80 | +` |
| 81 | + doc := markdown.Parse(strings.NewReader(input), tokenizer.Pos{}) |
| 82 | + output := html.RenderString(doc) |
| 83 | + fmt.Println(output) |
| 84 | +} |
| 85 | +``` |
| 86 | + |
| 87 | +Output: |
| 88 | + |
| 89 | +```html |
| 90 | +<h1>Hello World</h1><p>This is <strong>bold</strong> and <em>italic</em> text.</p><ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul> |
| 91 | +``` |
| 92 | + |
| 93 | +## Packages |
| 94 | + |
| 95 | +### `tokenizer` (root package) |
| 96 | + |
| 97 | +The lexical scanner that breaks input text into tokens. |
| 98 | + |
| 99 | +**Token Types:** |
| 100 | + |
| 101 | +- `Ident` - Identifiers (hello, world) |
| 102 | +- `NumberInteger`, `NumberFloat`, `NumberHex`, `NumberOctal`, `NumberBinary` - Numbers |
| 103 | +- `String`, `QuotedString` - String literals |
| 104 | +- `Hash`, `Asterisk`, `Underscore`, `Backtick`, `Tilde` - Special characters |
| 105 | +- `Space`, `Newline` - Whitespace |
| 106 | +- `Comment` - Comments (when enabled) |
| 107 | +- And more... |
| 108 | + |
| 109 | +**Scanner Features:** |
| 110 | + |
| 111 | +```go |
| 112 | +// Enable features with bitwise OR |
| 113 | +scanner := tokenizer.NewScanner(r, pos, |
| 114 | + tokenizer.HashComment | // # style comments |
| 115 | + tokenizer.LineComment | // // style comments |
| 116 | + tokenizer.BlockComment | // /* */ style comments |
| 117 | + tokenizer.NewlineToken | // Emit newlines as separate tokens |
| 118 | + tokenizer.UnderscoreToken | // Emit underscores as separate tokens |
| 119 | + tokenizer.NumberFloatToken, // Parse floating point numbers |
| 120 | +) |
| 121 | +``` |
| 122 | + |
| 123 | +### `pkg/ast` |
| 124 | + |
| 125 | +Defines the AST node types and tree traversal. |
| 126 | + |
| 127 | +```go |
| 128 | +// Node interface |
| 129 | +type Node interface { |
| 130 | + Kind() Kind |
| 131 | + Children() []Node |
| 132 | +} |
| 133 | + |
| 134 | +// Walk the AST |
| 135 | +ast.Walk(doc, func(node ast.Node, depth int) error { |
| 136 | + fmt.Printf("%s%s\n", strings.Repeat(" ", depth), node.Kind()) |
| 137 | + return nil |
| 138 | +}) |
| 139 | +``` |
| 140 | + |
| 141 | +### `pkg/markdown` |
| 142 | + |
| 143 | +Parses Markdown text into an AST. |
| 144 | + |
| 145 | +**Supported Syntax:** |
| 146 | + |
| 147 | +- Headings: `# H1` through `###### H6` |
| 148 | +- Paragraphs: Text separated by blank lines |
| 149 | +- Emphasis: `_italic_` or `*italic*` |
| 150 | +- Strong: `__bold__` or `**bold**` |
| 151 | +- Strikethrough: `~~deleted~~` |
| 152 | +- Inline code: `` `code` `` |
| 153 | +- Code blocks: ` ```language ... ``` ` |
| 154 | +- Links: `[text](url)` or `<url>` |
| 155 | +- Images: `![alt](url)` |
| 156 | +- Blockquotes: `> quoted text` |
| 157 | +- Unordered lists: `- item`, `* item`, or `+ item` |
| 158 | +- Ordered lists: `1. item` or `1) item` |
| 159 | +- Horizontal rules: `---`, `***`, or `___` |
| 160 | + |
| 161 | +### `pkg/markdown/html` |
| 162 | + |
| 163 | +Renders Markdown AST to HTML. |
| 164 | + |
| 165 | +```go |
| 166 | +// Render to string |
| 167 | +output := html.RenderString(doc) |
| 168 | + |
| 169 | +// Render to io.Writer with indentation |
| 170 | +renderer := html.NewRenderer(w).WithIndent(true) |
| 171 | +err := renderer.Render(doc) |
| 172 | +``` |
| 173 | + |
| 174 | +**Features:** |
| 175 | + |
| 176 | +- Proper HTML escaping for XSS prevention |
| 177 | +- Optional indented output for readability |
| 178 | +- Language classes on code blocks: `<code class="language-go">` |
| 179 | + |
| 180 | +## AST Node Types |
| 181 | + |
| 182 | +| Kind | Description | HTML Output | |
| 183 | +|------|-------------|-------------| |
| 184 | +| `Document` | Root node | (container) | |
| 185 | +| `Paragraph` | Text block | `<p>...</p>` | |
| 186 | +| `Heading` | H1-H6 | `<h1>...</h1>` through `<h6>...</h6>` | |
| 187 | +| `Text` | Plain text | (escaped text) | |
| 188 | +| `Emphasis` | Italic | `<em>...</em>` | |
| 189 | +| `Strong` | Bold | `<strong>...</strong>` | |
| 190 | +| `Strikethrough` | Deleted | `<del>...</del>` | |
| 191 | +| `Code` | Inline code | `<code>...</code>` | |
| 192 | +| `CodeBlock` | Fenced code | `<pre><code>...</code></pre>` | |
| 193 | +| `Link` | Hyperlink | `<a href="...">...</a>` | |
| 194 | +| `Image` | Image | `<img src="..." alt="..."/>` | |
| 195 | +| `Blockquote` | Quote | `<blockquote>...</blockquote>` | |
| 196 | +| `List` | Ordered/Unordered | `<ol>...</ol>` or `<ul>...</ul>` | |
| 197 | +| `ListItem` | List item | `<li>...</li>` | |
| 198 | +| `HorizontalRule` | Divider | `<hr/>` | |
| 199 | + |
| 200 | +## License |
| 201 | + |
| 202 | +Apache 2.0 - see [LICENSE](LICENSE) for details. |
0 commit comments