markdown.py
Python script, ASCII text executable
1
"""
2
This is a parser for a Markdown-like language, but it isn't compatible with
3
the CommonMark specification; check doc/enduser/Formatting messages.md for
4
its syntax.
5
6
Roundabout - git hosting for everyone <https://roundabout-host.com>
7
Copyright (C) 2023-2025 Roundabout developers <root@roundabout-host.com>
8
9
This program is free software: you can redistribute it and/or modify
10
it under the terms of the GNU Affero General Public License as published by
11
the Free Software Foundation, either version 3 of the License, or
12
(at your option) any later version.
13
14
This program is distributed in the hope that it will be useful,
15
but WITHOUT ANY WARRANTY; without even the implied warranty of
16
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
GNU Affero General Public License for more details.
18
19
You should have received a copy of the GNU Affero General Public License
20
along with this program. If not, see <http://www.gnu.org/licenses/>.
21
"""
22
23
24
import re
25
import bs4 as beautifulsoup
26
import sys
27
28
29
def only_chars(string, chars):
    """Return True when every character of *string* also occurs in *chars*.

    An empty *string* trivially satisfies the condition.
    """
    return set(string) <= set(chars)
33
34
35
# Verbose-mode regular expression recognising every inline construct.  It is
# compiled by parse_line() with re.MULTILINE | re.DOTALL | re.VERBOSE, so
# unescaped whitespace and "# ..." comments inside the string are ignored.
#
# Exactly one alternative matches at a time; parse_line() decides which
# construct was found by looking at which named group is set.
#
# NOTE(review): the "_" emphasis body excludes "*" rather than "_"
# (``[^*]``); this looks copied from the "*" alternative, but it is kept
# as-is because changing it would alter how nested underscores are parsed.
inline_regex = r"""
    (?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\)     # hyperlink or media
    |
    <(?P<urlDestination2>[^<>]*)>                                                       # autolink
    |
    (?P<em>\*{1,7}) (?P<textEm>(?:\\\*|[^*])*) (?P=em)                                  # emphasis with * not requiring space on either side
    |
    (?<!\S)(?P<em2>_{1,7}) (?P<textEm2>(?:\\.|[^*])*) (?P=em2)(?=\s|$)                  # emphasis with _ requiring space on at least one side; the lookbehind keeps the preceding whitespace out of the match
    |
    `` (?P<textCode2>(?:\\`|[^`])*) ``                                                  # inline code (2 backticks)
    |
    ` (?P<textCode>(?:\\`|[^`])*) `                                                     # inline code
    |
    (?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2})                             # strikethrough
    |
    (?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff)                       # diffs
"""
52
53
54
def leading(string, character):
    """Count how many characters at the start of *string* belong to *character*.

    *character* is handed to ``str.lstrip``, so a multi-character argument is
    treated as a set of characters to strip.
    """
    remainder = string.lstrip(character)
    return len(string) - len(remainder)
56
57
58
def trailing(string, character):
    """Count how many characters at the end of *string* belong to *character*.

    *character* is handed to ``str.rstrip``, so a multi-character argument is
    treated as a set of characters to strip.
    """
    remainder = string.rstrip(character)
    return len(string) - len(remainder)
60
61
62
class Element:
    """Base node of the document tree; on its own it stands for a void block."""

    def __init__(self):
        # Every node owns its own list of CSS classes and an optional payload.
        self.content = None
        self.classes = []

    @property
    def tag_name(self):
        """Name of the tag this node maps to; "m-void" marks a void block."""
        return "m-void"

    def __repr__(self):
        return "Void block"
74
75
76
class Container(Element):
    """Element whose content is a line of text parsed into inline tokens."""

    def __init__(self, content):
        super().__init__()
        # Replace the raw text with the token list produced by parse_line().
        parsed = parse_line(content)
        self.content = parsed

    def __repr__(self):
        return f"Generic container element: {self.content!r}"
83
84
85
class Rule(Element):
    """Horizontal rule; carries no content."""

    # No constructor of its own: Element.__init__ already sets everything up.

    @property
    def tag_name(self):
        """Name of the HTML tag for a rule."""
        return "hr"

    def __repr__(self):
        return "Rule"
95
96
97
class HardBreak(Element):
    """Forced line break; carries no content."""

    # No constructor of its own: Element.__init__ already sets everything up.

    @property
    def tag_name(self):
        """Name of the HTML tag for a hard break."""
        return "br"

    def __repr__(self):
        return "Hard break"
107
108
109
class Heading(Container):
    """Heading whose text is inline-parsed and whose level picks the tag."""

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    @property
    def tag_name(self):
        """Tag name built from the level, e.g. "h2" for level 2."""
        return f"h{self.level}"

    def __repr__(self):
        return f"Heading level {self.level}:\n\t{self.content!r}"
121
122
123
class Paragraph(Container):
    """Paragraph of text; its content is the inline-parsed token list."""

    def __init__(self, content):
        # Container.__init__ stores parse_line(content) as the content, which
        # is the same final state as parsing "" first and then re-parsing.
        super().__init__(content)

    @property
    def tag_name(self):
        """Name of the HTML tag for a paragraph."""
        return "p"

    def __repr__(self):
        return f"Paragraph:\n\t{self.content!r}"
134
135
136
class CodeBlock(Element):
    """Fenced code block; the content is kept as a raw, unparsed string."""

    def __init__(self, content, language="text"):
        super().__init__()
        self.language = language
        self.content = content

    @property
    def tag_name(self):
        """Name of the HTML tag for a code block."""
        return "pre"

    def __repr__(self):
        return f"Code block ({self.language}):\n\t{self.content!r}"
148
149
150
class UnorderedList(Element):
    """Bulleted list; the content is stored exactly as given by the caller."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    @property
    def tag_name(self):
        """Name of the HTML tag for an unordered list."""
        return "ul"

    def __repr__(self):
        return f"Unordered list:\n\t{self.content!r}"
161
162
163
class OrderedList(Element):
    """Numbered list; the content is stored exactly as given by the caller."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    @property
    def tag_name(self):
        """Name of the HTML tag for an ordered list."""
        return "ol"

    def __repr__(self):
        return f"Ordered list:\n\t{self.content!r}"
174
175
176
class ListItem(Element):
    """Single list entry; its text is tokenised again as block-level content."""

    def __init__(self, content):
        super().__init__()
        # Items may hold nested blocks, hence tokenise() and not parse_line().
        blocks = tokenise(content)
        self.content = blocks

    @property
    def tag_name(self):
        """Name of the HTML tag for a list item."""
        return "li"

    def __repr__(self):
        return f"List item:\n\t{self.content!r}"
187
188
189
class Blockquote(Paragraph):
    """Quotation; its text is tokenised again as block-level content."""

    def __init__(self, content):
        # Initialise the base classes with empty text, then store the
        # block-level tokens of the quoted text as the real content.
        super().__init__("")
        blocks = tokenise(content)
        self.content = blocks

    @property
    def tag_name(self):
        """Name of the HTML tag for a block quotation."""
        return "blockquote"

    def __repr__(self):
        return f"Blockquote:\n\t{self.content!r}"
200
201
202
class Emphasis(Container):
    """Inline emphasis; *value* is the number of marker characters used."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

        # One CSS class per satisfied condition, strongest first.
        class_rules = (
            (value >= 4, "emphasis-3"),
            (value % 4 >= 2, "emphasis-2"),
            (value % 2, "emphasis-1"),
        )
        for applies, class_name in class_rules:
            if applies:
                self.classes.append(class_name)

    @property
    def tag_name(self):
        """"em" for a single marker character, "strong" for anything else."""
        if self.value == 1:
            return "em"
        return "strong"

    def __repr__(self):
        return f"Emphasis ({self.value}): {self.content!r}"
219
220
221
class Code(Element):
    """Inline code span; the text is kept verbatim, never inline-parsed."""

    def __init__(self, content):
        super().__init__()
        # Wrapped in a one-item list so it is handled like other token lists.
        self.content = [content]

    @property
    def tag_name(self):
        """Name of the HTML tag for inline code."""
        return "code"

    def __repr__(self):
        return f"Inline code: {self.content}"
232
233
234
class Strikethrough(Container):
    """Struck-through inline text."""

    # No constructor of its own: Container.__init__ parses the content.

    @property
    def tag_name(self):
        """Name of the HTML tag for struck-through text."""
        return "s"

    def __repr__(self):
        return f"Strikethrough: {self.content!r}"
244
245
246
class Diff(Container):
    """Inline insertion or deletion; *value* is the marker that delimited it."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    @property
    def tag_name(self):
        """"ins" when the marker is "++", "del" for any other marker."""
        if self.value == "++":
            return "ins"
        return "del"

    def __repr__(self):
        return f"Diff ({self.value}): {self.content}"
257
258
259
class Link(Element):
    """Hyperlink with inline-parsed text and a destination.

    The *image* flag only affects the repr here; it is set by Image.
    """

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.image = image
        self.destination = destination
        self.content = parse_line(content)

    @property
    def tag_name(self):
        """Name of the HTML tag for a hyperlink."""
        return "a"

    def __repr__(self):
        kind = "Image" if self.image else "Link"
        return f"{kind}: {self.content} -> {self.destination}"
272
273
274
class Image(Link):
    """Embedded image: a Link created with its image flag switched on."""

    def __init__(self, text, destination):
        # The third positional argument of Link.__init__ is the image flag.
        super().__init__(text, destination, True)

    @property
    def tag_name(self):
        """Name of the HTML tag for an image."""
        return "img"
281
282
283
def parse_line(source):
    """Parse the inline markup in *source* into a flat list of tokens.

    The returned list alternates between plain strings (the text between
    recognised constructs, possibly empty) and inline element objects
    (Emphasis, Code, Strikethrough, Diff, Link, Image).  When *source* ends
    in exactly one backslash, that backslash is removed and a HardBreak is
    appended as the last token.

    Fixes compared with the previous revision:

    * a link or image with empty text (for example ``![](picture.png)``) was
      matched and consumed but produced no token, so it silently vanished;
    * an autolink with no ``:`` in it was given empty link text.
    """
    # Exactly one trailing backslash requests a hard break.
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    tokens = []
    pattern = re.compile(inline_regex, re.MULTILINE | re.DOTALL | re.VERBOSE)

    cursor = 0  # end of the previous match; text before the next match is plain
    for match in pattern.finditer(source):
        tokens.append(source[cursor:match.start()])
        cursor = match.end()

        # Only one alternative of the pattern can match at a time, so at most
        # one of these branches applies.
        if match.group("em"):
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("em2"):
            tokens.append(Emphasis(match.group("textEm2"), len(match.group("em2"))))
        elif match.group("textCode"):
            tokens.append(Code(match.group("textCode")))
        elif match.group("textCode2"):
            tokens.append(Code(match.group("textCode2")))
        elif match.group("strike"):
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff"):
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlText") is not None:
            # "is not None" rather than truthiness: empty link text is valid,
            # and images very commonly have an empty alternative text.
            if match.group("imageFlag"):
                tokens.append(Image(match.group("urlText"), match.group("urlDestination")))
            else:
                tokens.append(Link(match.group("urlText"), match.group("urlDestination")))
        elif match.group("urlDestination2"):
            target = match.group("urlDestination2")
            if "://" not in target:
                # Remove tel, mailto, sms prefixes from the visible text; when
                # there is no prefix at all, show the whole target.
                _, separator, remainder = target.partition(":")
                url_text = remainder if separator else target
                url_destination = target
                if url_destination.startswith("mailto:"):
                    # NOTE(review): as written both replacements are no-ops
                    # (the two arguments are the same character).  Presumably
                    # the replacement was meant to be an encoded form of "@";
                    # confirm against the repository before changing it.
                    url_destination = url_destination.replace("@", "@")    # prevent email harvesting
                    url_text = url_text.replace("@", "@")                  # prevent protocol injection
            else:
                url_text = url_destination = target

            tokens.append(Link(url_text, url_destination))

    # Whatever follows the last match is plain text.
    tokens.append(source[cursor:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens
337
338
339
def _is_rule(line):
    """Tell whether *line* is a horizontal rule (3+ chars, all from "-_* ")."""
    stripped = line.strip()
    return only_chars(stripped, "-_* ") and len(stripped) >= 3


def _is_bullet(line):
    """Tell whether *line* starts an unordered list item ("*", "+" or "-" then a space)."""
    return line.startswith(("*", "+", "-")) and line[1:].startswith(" ")


def _strip_indent(line, width):
    """Remove at most *width* leading spaces from *line*, never any text."""
    return line[min(leading(line, " "), width):]


def tokenise(source):
    """Split *source* into a list of block-level elements.

    The result always starts with a void Element and contains, in document
    order, Element (void), Rule, UnorderedList, OrderedList, Heading,
    Blockquote, CodeBlock and Paragraph objects.  One leading space is
    removed from every line before classification.

    Fixes compared with the previous revision:

    * a line such as ``#foo`` or ``~~~~`` reached the paragraph branch, which
      refused to consume it, so the parser looped forever;
    * an ordered-list item on the last line raised IndexError, and the
      marker width was measured on the following line instead of the item;
    * a lone ``*``, ``+`` or ``-`` inside a list item raised IndexError;
    * a block directly followed by another block of the same type (two
      adjacent code fences, or a Blockquote followed by a paragraph, since
      Blockquote subclasses Paragraph) was silently dropped;
    * the blockquote continuation test examined the first line of the quote
      instead of the current line, so a rule never ended a quote.
    """
    tokens = []

    current_block = Element()

    lines = [line[1:] if line.startswith(" ") else line for line in source.split("\n")]  # remove leading spaces

    i = 0
    while i < len(lines):
        line = lines[i]

        # Every branch below starts a new block, so the block built so far is
        # always flushed first; nothing is ever merged into an existing block.
        tokens.append(current_block)

        if not line.strip() or line.startswith(";"):
            # Void block
            current_block = Element()
            i += 1
        elif _is_rule(line):
            # Horizontal rule
            current_block = Rule()
            i += 1
        elif _is_bullet(line):
            items = []
            while i < len(lines) and _is_bullet(lines[i]):
                inner_content = lines[i][2:].strip() + "\n"  # discard marker and space
                i += 1
                # Following non-blank lines belong to the item until the next marker.
                while i < len(lines) and lines[i].strip() and not _is_bullet(lines[i]):
                    inner_content += lines[i] + "\n"
                    i += 1

                items.append(ListItem(inner_content))

            current_block = UnorderedList(items)
        elif re.match(r"^\d+\.", line):
            items = []
            while i < len(lines) and re.match(r"^ ?\d+\.", lines[i]):
                marker, _, text = lines[i].partition(".")  # discard number and period
                inner_content = text + "\n"
                # Continuation lines are expected to be indented by the width
                # of the marker (number plus period).
                marker_length = len(marker) + 1
                i += 1
                while i < len(lines) and lines[i].strip() and not re.match(r"^ ?\d+\.", lines[i]):
                    inner_content += _strip_indent(lines[i], marker_length) + "\n"
                    i += 1

                items.append(ListItem(inner_content))

            current_block = OrderedList(items)
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # ATX heading: the number of "#" characters is the level.
            content = line.lstrip("#").strip()
            current_block = Heading(content, leading(line, "#"))
            i += 1
        elif line.startswith(">"):
            content = ""
            # Quoted lines, plus lazy continuation lines that are neither
            # blank, nor "#" lines, nor horizontal rules.
            while i < len(lines) and (
                lines[i].startswith(">")
                or (
                    lines[i].strip()
                    and not lines[i].startswith("#")
                    and not _is_rule(lines[i])
                )
            ):
                content += lines[i].lstrip(">") + "\n"
                i += 1

            current_block = Blockquote(content)
        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            language = line.lstrip("`~").strip()

            content = ""
            i += 1  # skip the opening fence
            while i < len(lines) and lines[i].strip() not in ("```", "~~~"):
                content += lines[i] + "\n"
                i += 1

            if i < len(lines):
                i += 1  # prevent a new block from beginning with the closing fence

            current_block = CodeBlock(content, language=language)
        elif (
            i < len(lines) - 1
            and lines[i + 1].strip()
            and (only_chars(lines[i + 1].strip(), "=") or only_chars(lines[i + 1].strip(), "-"))
        ):
            # Setext heading: text underlined with "=" (level 1) or "-" (level 2).
            underline = lines[i + 1].strip()
            current_block = Heading(line.strip(), 1 if underline.startswith("=") else 2)
            i += 2
        else:
            # Create a paragraph, if there is no other specifier.  The line
            # that brought us here is always consumed; this guarantees
            # progress even when it looks like the start of another block
            # that no branch above accepted (e.g. "#foo" or "~~~~").
            parts = [line.strip()]
            i += 1

            while (
                i < len(lines)
                and lines[i].strip()
                and not lines[i].startswith(("#", ">", ";", "* ", "+ ", "- ", "~~~", "```"))
                and not re.match(r"^\d+\.", lines[i])
            ):
                parts.append(lines[i].strip())
                i += 1

            current_block = Paragraph("".join(part + "\n" for part in parts))

    tokens.append(current_block)

    return tokens
465
466
467
def _plain_text(node):
    """Return the text of *node* (a string or an element) without any markup."""
    if isinstance(node, str):
        return node
    content = getattr(node, "content", None)
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        return "".join(_plain_text(child) for child in content)
    return ""


def make_html(ast):
    """Render a list of tokens into a BeautifulSoup document.

    *ast* is a list whose items are either plain strings, appended as text,
    or element objects exposing ``tag_name``, ``content`` and ``classes``.
    Elements whose tag name is "m-void" are skipped.  List content is
    rendered recursively; string content becomes the text of the tag.

    Fix compared with the previous revision: the ``alt`` attribute of an
    image was built with ``" ".join(content)``, which raised TypeError as
    soon as the alternative text contained an inline element (for example
    emphasis).  The text of such elements is now extracted instead.
    """
    soup = beautifulsoup.BeautifulSoup()
    for node in ast:
        # Use bs4 to generate HTML
        if isinstance(node, str):
            soup.append(node)
        elif hasattr(node, "content") and node.tag_name != "m-void":
            tag = soup.new_tag(str(node.tag_name))
            if node.tag_name == "a":
                # NOTE(review): the destination is used as written in the
                # source text; no URL-scheme filtering is visible in this
                # function, so confirm that callers sanitise the output.
                tag["href"] = node.destination
            if node.tag_name == "img":
                tag["src"] = node.destination
                tag["alt"] = " ".join(_plain_text(child) for child in node.content)
            if node.tag_name == "pre":
                tag["data-language"] = node.language
            if node.classes:
                tag["class"] = " ".join(node.classes)
            try:
                if isinstance(node.content, list):
                    tag.append(make_html(node.content))
                elif node.content and node.tag_name != "img":
                    tag.string = node.content

                if node.tag_name == "img":
                    # Images must not keep any children; the text lives in "alt".
                    tag.string = ""
            except AttributeError as exc:
                # Best effort: report the problem and still emit the tag.
                print(exc, file=sys.stderr)
            soup.append(tag)
    return soup
497
498
499
def markdown2html(markdown):
    """Convert *markdown* text into the document built by make_html()."""
    blocks = tokenise(markdown)
    return make_html(blocks)
501
502