By using this site, you agree to have cookies stored on your device, strictly for functional purposes, such as storing your session and preferences.

Dismiss

 markdown.py

View raw Download
text/plain • 14.68 kiB
Python script, ASCII text executable
        
            
1
"""
This is a parser for a Markdown-like language, but it isn't compatible with
the CommonMark specification; check doc/enduser/Formatting messages.md for
its syntax.

Roundabout - git hosting for everyone <https://roundabout-host.com>
Copyright (C) 2023-2025 Roundabout developers <root@roundabout-host.com>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
22
23
24
import re
25
import bs4 as beautifulsoup
26
import sys
27
28
29
def only_chars(string, chars):
    """Return True when every character of ``string`` also occurs in ``chars``.

    An empty ``string`` yields True, because the empty set is a subset of
    every set.
    """
    allowed = set(chars)
    return set(string) <= allowed
33
34
35
# Pattern for every inline construct, written for re.VERBOSE (whitespace and
# "#" comments inside it are ignored).  Each alternative uses its own named
# groups, so a consumer can tell which construct matched by checking which
# group is set.
#
# * The "_" emphasis alternative uses a zero-width lookbehind so the whitespace
#   before the opening marker is NOT part of the match and stays in the text.
# * The two-backtick alternative really requires two backticks on each side and
#   allows single backticks inside; it is listed before the one-backtick form
#   so that "``a`b``" is not split up by the shorter delimiter.
inline_regex = r"""
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\) # hyperlink or media
|
<(?P<urlDestination2>[^<>]*)> # autolink
|
(?P<em>\*{1,7}) (?P<textEm>(?:\\\*|[^*])*) (?P=em) # emphasis with * not requiring space on either side
|
(?<!\S)(?P<em2>_{1,7}) (?P<textEm2>(?:\\.|[^*])*) (?P=em2)(?=\s|$) # emphasis with _ requiring space on at least one side
|
`` (?P<textCode2>(?:\\`|`(?!`)|[^`])*) `` # inline code (2 backticks)
|
[`] (?P<textCode>(?:\\[`]|[^`])*) [`] # inline code
|
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2}) # strikethrough
|
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff) # diffs
"""
52
53
54
def leading(string, character):
    """Count the characters at the start of ``string`` that ``str.lstrip`` removes.

    ``character`` is passed straight to ``str.lstrip``, so it acts as a set of
    characters rather than as a prefix.
    """
    remainder = string.lstrip(character)
    return len(string) - len(remainder)
56
57
58
def trailing(string, character):
    """Count the characters at the end of ``string`` that ``str.rstrip`` removes.

    ``character`` is passed straight to ``str.rstrip``, so it acts as a set of
    characters rather than as a suffix.
    """
    remainder = string.rstrip(character)
    return len(string) - len(remainder)
60
61
62
class Element:
    """Base node of the parsed document; on its own it is an empty ("void") block."""

    def __init__(self):
        # CSS class names for the rendered tag.
        self.classes = []
        # Payload of the node; subclasses replace it with text or child nodes.
        self.content = None

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        """Tag name for this node; "m-void" marks a node that is not rendered."""
        return "m-void"
74
75
76
class Container(Element):
    """Element whose content is a line of inline markup, parsed on construction."""

    def __init__(self, content):
        super().__init__()
        # Store the inline token list produced by parse_line().
        self.content = parse_line(content)

    def __repr__(self):
        return f"Generic container element: {self.content!r}"
83
84
85
class Rule(Element):
    """Horizontal rule; carries no content."""

    def __init__(self):
        super().__init__()

    def __repr__(self):
        return "Rule"

    @property
    def tag_name(self):
        """Tag name used when rendering: ``hr``."""
        return "hr"
95
96
97
class HardBreak(Element):
    """Forced line break; carries no content."""

    def __init__(self):
        super().__init__()

    def __repr__(self):
        return "Hard break"

    @property
    def tag_name(self):
        """Tag name used when rendering: ``br``."""
        return "br"
107
108
109
class Heading(Container):
    """Heading whose inline content is parsed and whose depth is ``level``."""

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering: ``h`` followed by the level."""
        return f"h{self.level}"
121
122
123
class Paragraph(Container):
    """Paragraph of text; its content is the parsed inline token list."""

    def __init__(self, content):
        # Container parses the text with parse_line() and stores the result.
        super().__init__(content)

    def __repr__(self):
        return f"Paragraph:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering: ``p``."""
        return "p"
134
135
136
class CodeBlock(Element):
    """Fenced code block; ``content`` is kept as the raw, unparsed text."""

    def __init__(self, content, language="text"):
        super().__init__()
        self.language = language
        self.content = content

    def __repr__(self):
        return f"Code block ({self.language}):\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering: ``pre``."""
        return "pre"
148
149
150
class UnorderedList(Element):
    """Bulleted list; ``content`` is stored exactly as given by the caller."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return f"Unordered list:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering: ``ul``."""
        return "ul"
161
162
163
class OrderedList(Element):
    """Numbered list; ``content`` is stored exactly as given by the caller."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return f"Ordered list:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering: ``ol``."""
        return "ol"
174
175
176
class ListItem(Element):
    """One list entry; its text is block-parsed, so it can hold nested blocks."""

    def __init__(self, content):
        super().__init__()
        # Run the block-level parser over the item's text.
        self.content = tokenise(content)

    def __repr__(self):
        return f"List item:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering: ``li``."""
        return "li"
187
188
189
class Blockquote(Paragraph):
    """Quoted section; its text is block-parsed rather than inline-parsed."""

    def __init__(self, content):
        # Initialise the base classes with empty text, then replace the
        # content with the block-level parse of the quoted text.
        super().__init__("")
        self.content = tokenise(content)

    def __repr__(self):
        return f"Blockquote:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering: ``blockquote``."""
        return "blockquote"
200
201
202
class Emphasis(Container):
    """Emphasised inline text.

    ``value`` selects which of the classes ``emphasis-3``, ``emphasis-2`` and
    ``emphasis-1`` are attached, in that order.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value
        class_rules = (
            (value >= 4, "emphasis-3"),
            (value % 4 >= 2, "emphasis-2"),
            (value % 2, "emphasis-1"),
        )
        for applies, class_name in class_rules:
            if applies:
                self.classes.append(class_name)

    def __repr__(self):
        return f"Emphasis ({self.value}): {self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering: ``em`` for value 1, otherwise ``strong``."""
        if self.value == 1:
            return "em"
        return "strong"
219
220
221
class Code(Element):
    """Inline code span; the text is kept verbatim, wrapped in a one-item list."""

    def __init__(self, content):
        super().__init__()
        # Not parsed for inline markup; stored as a single string child.
        self.content = [content]

    def __repr__(self):
        return f"Inline code: {self.content}"

    @property
    def tag_name(self):
        """Tag name used when rendering: ``code``."""
        return "code"
232
233
234
class Strikethrough(Container):
    """Struck-through inline text; the content is parsed for inline markup."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return f"Strikethrough: {self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering: ``s``."""
        return "s"
244
245
246
class Diff(Container):
    """Inserted or deleted inline text; ``value`` is the marker that delimited it."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return f"Diff ({self.value}): {self.content}"

    @property
    def tag_name(self):
        """Tag name used when rendering: ``ins`` for "++", otherwise ``del``."""
        if self.value == "++":
            return "ins"
        return "del"
257
258
259
class Link(Element):
    """Hyperlink: inline-parsed text plus the destination it points to."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.destination = destination
        self.image = image
        # The link text may itself contain inline markup.
        self.content = parse_line(content)

    def __repr__(self):
        kind = "Image" if self.image else "Link"
        return f"{kind}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        """Tag name used when rendering: ``a``."""
        return "a"
272
273
274
class Image(Link):
    """Embedded image; a Link created with its ``image`` flag set."""

    def __init__(self, text, destination):
        super().__init__(text, destination, image=True)

    @property
    def tag_name(self):
        """Tag name used when rendering: ``img``."""
        return "img"
281
282
283
def parse_line(source):
    """Split a run of inline markup into a flat list of tokens.

    The result alternates plain-text strings (possibly empty) with inline
    elements (Emphasis, Code, Strikethrough, Diff, Link, Image), in source
    order.  If ``source`` ends in exactly one backslash, the backslash is
    removed and a HardBreak is appended as the last token.

    :param source: the inline text to parse.
    :return: list of ``str`` and Element instances.
    """
    # Exactly one trailing backslash requests a hard break after the text.
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    tokens = []
    pattern = re.compile(inline_regex, re.MULTILINE | re.DOTALL | re.VERBOSE)

    lookup = 0  # index just past the previous match
    for match in pattern.finditer(source):
        # Plain text between the previous match and this one.
        tokens.append(source[lookup:match.start()])
        lookup = match.end()

        # The alternatives of the pattern are mutually exclusive, so at most
        # one of these branches applies to a given match.
        if match.group("em"):
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("em2"):
            tokens.append(Emphasis(match.group("textEm2"), len(match.group("em2"))))
        elif match.group("textCode"):
            tokens.append(Code(match.group("textCode")))
        elif match.group("textCode2"):
            tokens.append(Code(match.group("textCode2")))
        elif match.group("strike"):
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff"):
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlText") is not None:
            # Compare against None rather than truthiness: a link or image
            # with empty text, such as "![](picture.png)", must still be kept.
            if match.group("imageFlag"):
                tokens.append(Image(match.group("urlText"), match.group("urlDestination")))
            else:
                tokens.append(Link(match.group("urlText"), match.group("urlDestination")))
        elif match.group("urlDestination2"):
            url_destination = match.group("urlDestination2")
            if "://" not in url_destination:
                # Remove tel, mailto, sms prefixes from the visible text.
                _, separator, remainder = url_destination.partition(":")
                # With no prefix to remove, show the target itself instead of
                # producing a link with empty, invisible text.
                url_text = remainder if separator else url_destination
                if url_destination.startswith("mailto:"):
                    url_destination = url_destination.replace("@", "&#64;")  # prevent email harvesting
                    url_text = url_text.replace("@", "&#64;")  # prevent protocol injection
            else:
                url_text = url_destination

            tokens.append(Link(url_text, url_destination))

    # Whatever follows the last match (the whole string if nothing matched).
    tokens.append(source[lookup:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens
337
338
339
def _is_rule(line):
    """Return True if ``line`` is a horizontal rule: 3+ characters, all from ``-_* ``."""
    stripped = line.strip()
    return len(stripped) >= 3 and only_chars(stripped, "-_* ")


def _is_bullet(line):
    """Return True if ``line`` opens a bullet item: ``*``, ``+`` or ``-``, then a space."""
    return line.startswith(("*", "+", "-")) and line[1:].startswith(" ")


def _interrupts_paragraph(line):
    """Return True if ``line`` may not continue a paragraph that is already open."""
    return (
        not line.strip()
        or line.startswith(("#", ">", ";", "* ", "+ ", "- ", "~~~", "```"))
        or re.match(r"^\d+\.", line) is not None
    )


def tokenise(source):
    """Parse block-level markup into a list of Element objects.

    Recognised, in order of precedence: blank or ``;`` lines (void blocks),
    horizontal rules, bullet lists, numbered lists, ``#`` headings,
    ``>`` blockquotes, fenced code blocks, underlined headings and, as the
    fallback, paragraphs.  The returned list always starts with a void
    Element, and a void Element is emitted for every blank or ``;`` line.

    Every finished block is appended to the result before the next one is
    started, and every branch consumes at least one line, so the loop always
    terminates.

    :param source: the markup text.
    :return: list of Element instances in document order.
    """
    tokens = []
    current_block = Element()

    # Drop a single leading space from each line.
    lines = [line[1:] if line.startswith(" ") else line for line in source.split("\n")]

    i = 0
    while i < len(lines):
        line = lines[i]

        if not line.strip() or line.startswith(";"):
            # Void block
            tokens.append(current_block)
            current_block = Element()
            i += 1

        elif _is_rule(line):
            # Horizontal rule
            tokens.append(current_block)
            current_block = Rule()
            i += 1

        elif _is_bullet(line):
            tokens.append(current_block)
            content = []

            while i < len(lines) and _is_bullet(lines[i]):
                inner_content = lines[i][2:].strip() + "\n"  # discard marker and space
                i += 1
                # Non-blank lines up to the next marker belong to this item.
                while i < len(lines) and lines[i].strip() and not _is_bullet(lines[i]):
                    inner_content += lines[i] + "\n"
                    i += 1

                content.append(ListItem(inner_content))

            current_block = UnorderedList(content)

        elif re.match(r"^\d+\.", line):
            tokens.append(current_block)
            content = []

            while i < len(lines) and re.match(r"^ ?\d+\.", lines[i]):
                inner_content = lines[i].split(".", 1)[1] + "\n"  # discard number and period
                i += 1
                # Non-blank lines up to the next numbered line belong to this
                # item; their first two characters are dropped.
                while i < len(lines) and lines[i].strip() and not re.match(r"^ ?\d+\.", lines[i]):
                    inner_content += lines[i][2:] + "\n"
                    i += 1

                content.append(ListItem(inner_content))

            current_block = OrderedList(content)

        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # "#" markers followed by at least one space: the count is the level.
            tokens.append(current_block)
            current_block = Heading(line.lstrip("#").strip(), leading(line, "#"))
            i += 1

        elif line.startswith(">"):
            tokens.append(current_block)
            content = ""

            # Quoted lines, plus unquoted continuation lines; a blank line,
            # a line starting with "#" or a horizontal rule ends the quote.
            while i < len(lines) and (
                lines[i].startswith(">")
                or (
                    lines[i].strip()
                    and not lines[i].startswith("#")
                    and not _is_rule(lines[i])
                )
            ):
                content += lines[i].lstrip(">") + "\n"
                i += 1

            current_block = Blockquote(content)

        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            tokens.append(current_block)
            language = line.lstrip("`~").strip()

            content = ""
            i += 1  # skip the opening fence
            while i < len(lines) and lines[i].strip() not in ("```", "~~~"):
                content += lines[i] + "\n"
                i += 1

            if i < len(lines):
                i += 1  # prevent a new block from beginning with the closing fence

            current_block = CodeBlock(content, language=language)

        elif (
            i + 1 < len(lines)
            and lines[i + 1].strip()
            and (only_chars(lines[i + 1].strip(), "=") or only_chars(lines[i + 1].strip(), "-"))
        ):
            # Heading underlined with "=" (level 1) or "-" (level 2).
            tokens.append(current_block)
            underline = lines[i + 1].strip()
            current_block = Heading(line.strip(), 1 if underline.startswith("=") else 2)
            i += 2

        else:
            # Create a paragraph, if there is no other specifier.
            tokens.append(current_block)

            # No other branch claimed this line, so it opens the paragraph
            # even when it begins with a marker character ("#hashtag", "~~~~").
            content = lines[i].strip() + "\n"
            i += 1
            while i < len(lines) and not _interrupts_paragraph(lines[i]):
                content += lines[i].strip() + "\n"
                i += 1

            current_block = Paragraph(content)

    tokens.append(current_block)

    return tokens
463
464
465
def _plain_text(nodes):
    """Concatenate the text of ``nodes``, descending into nested inline elements."""
    parts = []
    for node in nodes:
        if isinstance(node, str):
            parts.append(node)
            continue
        inner = getattr(node, "content", None)
        if isinstance(inner, list):
            parts.append(_plain_text(inner))
        elif isinstance(inner, str):
            parts.append(inner)
    return "".join(parts)


def make_html(ast):
    """Render a list of strings and Element nodes into a BeautifulSoup tree.

    Strings are appended as text.  Elements become tags named after their
    ``tag_name``; nodes whose tag name is "m-void" are skipped.  Links get
    ``href``, images get ``src`` and ``alt``, code blocks get
    ``data-language``, and any ``classes`` become the ``class`` attribute.
    List content is rendered recursively.

    :param ast: iterable of ``str`` and Element instances.
    :return: the BeautifulSoup object holding the rendered nodes.
    """
    soup = beautifulsoup.BeautifulSoup()
    for node in ast:
        # Use bs4 to generate HTML
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        name = node.tag_name
        tag = soup.new_tag(str(name))
        if name == "a":
            tag["href"] = node.destination
        if name == "img":
            tag["src"] = node.destination
            # The alt content is a parsed token list that may contain inline
            # elements; flatten it to plain text instead of joining it directly.
            tag["alt"] = _plain_text(node.content)
        if name == "pre":
            tag["data-language"] = node.language
        if node.classes:
            tag["class"] = " ".join(node.classes)

        try:
            if isinstance(node.content, list):
                tag.append(make_html(node.content))
            elif node.content and name != "img":
                tag.string = node.content

            if name == "img":
                # An image carries its text in "alt"; clear any children.
                tag.string = ""
        except AttributeError as exc:
            print(exc, file=sys.stderr)
        soup.append(tag)
    return soup
495
496
497
def markdown2html(markdown):
    """Parse ``markdown`` with tokenise() and render the result with make_html()."""
    ast = tokenise(markdown)
    return make_html(ast)
499
500