You're looking at it

Homepage: https://roundabout-host.com

By using this site, you agree to have cookies stored on your device, strictly for functional purposes, such as storing your session and preferences.

Dismiss

 markdown.py

View raw Download
text/plain • 14.79 kiB
Python script, ASCII text executable
        
            
1
"""
2
This is a parser for a Markdown-like language, but it isn't compatible with
3
the CommonMark specification; check doc/enduser/Formatting messages.md for
4
its syntax.
5
6
Roundabout - git hosting for everyone <https://roundabout-host.com>
7
Copyright (C) 2023-2025 Roundabout developers <root@roundabout-host.com>
8
9
This program is free software: you can redistribute it and/or modify
10
it under the terms of the GNU Affero General Public License as published by
11
the Free Software Foundation, either version 3 of the License, or
12
(at your option) any later version.
13
14
This program is distributed in the hope that it will be useful,
15
but WITHOUT ANY WARRANTY; without even the implied warranty of
16
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
GNU Affero General Public License for more details.
18
19
You should have received a copy of the GNU Affero General Public License
20
along with this program. If not, see <http://www.gnu.org/licenses/>.
21
"""
22
23
24
import re
25
import bs4 as beautifulsoup
26
import sys
27
28
29
def only_chars(string, chars):
    """Return True when every character of ``string`` also occurs in ``chars``.

    An empty ``string`` contains no characters and therefore always passes.
    """
    allowed = set(chars)
    return set(string) <= allowed
33
34
35
# Pattern describing every inline construct. It relies on whitespace and
# ``#`` comments being ignored, so it must be compiled with re.VERBOSE.
# Alternatives are tried in order; the two-backtick form of inline code has to
# come before the one-backtick form so that ``a`b`` is read as a single span.
# NOTE(review): the ``_`` emphasis body excludes ``*`` rather than ``_``; that
# looks unintentional but is left unchanged here - confirm the intended syntax.
inline_regex = r"""
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\) # hyperlink or media
|
<(?P<urlDestination2>[^<>]*)> # autolink
|
(?P<em>\*{1,7}) (?P<textEm>(?:\\\*|[^*])*) (?P=em) # emphasis with * not requiring space on either side
|
(?:^|\s)(?P<em2>_{1,7}) (?P<textEm2>(?:\\.|[^*])*) (?P=em2)(?=\s|$) # emphasis with _ requiring space on at least one side
|
`` (?P<textCode2>(?:\\`|\\(?!`)|[^`\\]|`(?!`))+?) `` # inline code (2 backticks), may contain lone backticks
|
` (?P<textCode>(?:\\`|[^`])*) ` # inline code
|
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2}) # strikethrough
|
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff) # diffs
"""
52
53
54
def leading(string, character):
    """Count the characters removed from the start of ``string`` by ``lstrip``.

    ``character`` is passed straight to ``str.lstrip``, so a multi-character
    value is treated as a set of characters to strip.
    """
    remainder = string.lstrip(character)
    return len(string) - len(remainder)
56
57
58
def trailing(string, character):
    """Count the characters removed from the end of ``string`` by ``rstrip``.

    ``character`` is passed straight to ``str.rstrip``, so a multi-character
    value is treated as a set of characters to strip.
    """
    remainder = string.rstrip(character)
    return len(string) - len(remainder)
60
61
62
class Element:
    """Base class of every node; a plain ``Element`` is the "void" block.

    Instances start with an empty list of CSS classes and no content.
    """

    def __init__(self):
        self.classes = []
        self.content = None

    @property
    def tag_name(self):
        """Tag name used for this node; ``m-void`` for the base class."""
        return "m-void"

    def __repr__(self):
        return "Void block"
74
75
76
class Container(Element):
    """Element whose content is the inline-parsed form of a source string."""

    def __init__(self, content):
        super().__init__()
        # Inline markup is resolved immediately by parse_line.
        self.content = parse_line(content)

    def __repr__(self):
        return f"Generic container element: {self.content!r}"
83
84
85
class Rule(Element):
    """Horizontal rule; it has no content and uses the inherited initialiser."""

    @property
    def tag_name(self):
        return "hr"

    def __repr__(self):
        return "Rule"
95
96
97
class HardBreak(Element):
    """Forced line break; it has no content and uses the inherited initialiser."""

    @property
    def tag_name(self):
        return "br"

    def __repr__(self):
        return "Hard break"
107
108
109
class Heading(Container):
    """Heading whose text is inline-parsed and whose ``level`` picks the tag."""

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    @property
    def tag_name(self):
        """``h`` followed by the level, e.g. ``h2`` for level 2."""
        return f"h{self.level}"

    def __repr__(self):
        return f"Heading level {self.level}:\n\t{self.content!r}"
121
122
123
class Paragraph(Container):
    """Paragraph of text; its content is the inline-parsed source."""

    def __init__(self, content):
        # Container already stores parse_line(content) as the content.
        super().__init__(content)

    @property
    def tag_name(self):
        return "p"

    def __repr__(self):
        return f"Paragraph:\n\t{self.content!r}"
134
135
136
class CodeBlock(Element):
    """Fenced code block; ``content`` is kept verbatim, never inline-parsed."""

    def __init__(self, content, language="text"):
        super().__init__()
        self.language = language
        self.content = content

    @property
    def tag_name(self):
        return "pre"

    def __repr__(self):
        return f"Code block ({self.language}):\n\t{self.content!r}"
148
149
150
class UnorderedList(Element):
    """Bulleted list; ``content`` is stored exactly as given."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    @property
    def tag_name(self):
        return "ul"

    def __repr__(self):
        return f"Unordered list:\n\t{self.content!r}"
161
162
163
class OrderedList(Element):
    """Numbered list; ``content`` is stored exactly as given."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    @property
    def tag_name(self):
        return "ol"

    def __repr__(self):
        return f"Ordered list:\n\t{self.content!r}"
174
175
176
class ListItem(Element):
    """Single list entry; its source is block-tokenised, so it may nest blocks."""

    def __init__(self, content):
        super().__init__()
        self.content = tokenise(content)

    @property
    def tag_name(self):
        return "li"

    def __repr__(self):
        return f"List item:\n\t{self.content!r}"
187
188
189
class Blockquote(Paragraph):
    """Quotation; its source is block-tokenised rather than inline-parsed."""

    def __init__(self, content):
        # Initialise the paragraph empty, then replace the content with blocks.
        super().__init__("")
        self.content = tokenise(content)

    @property
    def tag_name(self):
        return "blockquote"

    def __repr__(self):
        return f"Blockquote:\n\t{self.content!r}"
200
201
202
class Emphasis(Container):
    """Emphasised inline text.

    ``value`` selects the CSS classes: 4 or more adds ``emphasis-3``, a
    remainder modulo 4 of at least 2 adds ``emphasis-2`` and an odd value adds
    ``emphasis-1``.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value
        # Checked from the strongest class down, which fixes the class order.
        candidates = (
            (value >= 4, "emphasis-3"),
            (value % 4 >= 2, "emphasis-2"),
            (value % 2, "emphasis-1"),
        )
        for active, css_class in candidates:
            if active:
                self.classes.append(css_class)

    @property
    def tag_name(self):
        """``em`` for a value of exactly 1, ``strong`` for anything else."""
        if self.value == 1:
            return "em"
        return "strong"

    def __repr__(self):
        return f"Emphasis ({self.value}): {self.content!r}"
219
220
221
class Code(Element):
    """Inline code; the raw text is wrapped in a one-item list, unparsed."""

    def __init__(self, content):
        super().__init__()
        self.content = [content]

    @property
    def tag_name(self):
        return "code"

    def __repr__(self):
        return f"Inline code: {self.content}"
232
233
234
class Strikethrough(Container):
    """Struck-through inline text; construction is inherited from Container."""

    @property
    def tag_name(self):
        return "s"

    def __repr__(self):
        return f"Strikethrough: {self.content!r}"
244
245
246
class Diff(Container):
    """Inserted or deleted inline text; ``value`` is the marker that was used."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    @property
    def tag_name(self):
        """``ins`` when the marker is ``++``, ``del`` otherwise."""
        if self.value == "++":
            return "ins"
        return "del"

    def __repr__(self):
        return f"Diff ({self.value}): {self.content}"
257
258
259
class Link(Element):
    """Hyperlink; the label is inline-parsed, the destination stored as given."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.image = image
        self.destination = destination
        self.content = parse_line(content)

    @property
    def tag_name(self):
        return "a"

    def __repr__(self):
        kind = "Image" if self.image else "Link"
        return f"{kind}: {self.content} -> {self.destination}"
272
273
274
class Image(Link):
    """Link variant for media: always flagged as an image, with its own tag."""

    def __init__(self, text, destination):
        super().__init__(text, destination, image=True)

    @property
    def tag_name(self):
        return "img"
281
282
283
def parse_line(source):
    """Split ``source`` into plain strings and inline elements.

    The text is scanned with ``inline_regex``. The returned list alternates
    the literal text found between matches (possibly empty strings) with the
    element built for each match. When ``source`` ends in exactly one
    backslash, that backslash is removed and a ``HardBreak`` is appended.

    Inline code with empty text produces no element, but the matched text is
    still consumed.
    """
    # A single trailing backslash requests a hard line break.
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    tokens = []
    pattern = re.compile(inline_regex, re.MULTILINE | re.DOTALL | re.VERBOSE)

    lookup = 0
    for match in pattern.finditer(source):
        # The "_" emphasis alternative also consumes the whitespace in front of
        # the marker; keep that whitespace as text instead of dropping it.
        if match.group("em2"):
            text_end = match.start("em2")
        else:
            text_end = match.start()
        tokens.append(source[lookup:text_end])
        lookup = match.end()

        # Each match comes from exactly one alternative of the pattern, so at
        # most one of these branches applies.
        if match.group("em"):
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("em2"):
            tokens.append(Emphasis(match.group("textEm2"), len(match.group("em2"))))
        elif match.group("textCode"):
            tokens.append(Code(match.group("textCode")))
        elif match.group("textCode2"):
            tokens.append(Code(match.group("textCode2")))
        elif match.group("strike"):
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff"):
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlText") is not None:
            # Compare with None: an empty label, as in ![](picture.png), is
            # still a link or image and must not be discarded.
            if match.group("imageFlag"):
                tokens.append(Image(match.group("urlText"), match.group("urlDestination")))
            else:
                tokens.append(Link(match.group("urlText"), match.group("urlDestination")))
        elif match.group("urlDestination2"):
            url_destination = match.group("urlDestination2")
            if "://" not in url_destination:
                # Remove tel, mailto, sms prefixes from the visible text; fall
                # back to the whole target so the link never ends up empty.
                url_text = url_destination.partition(":")[2] or url_destination
                if url_destination.startswith("mailto:"):
                    url_destination = url_destination.replace("@", "&#64;")  # prevent email harvesting
                    url_text = url_text.replace("@", "&#64;")  # prevent protocol injection
            else:
                url_text = url_destination

            tokens.append(Link(url_text, url_destination))

    tokens.append(source[lookup:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens
337
338
339
def _is_rule_line(line):
    """Tell whether ``line`` is a horizontal rule: 3+ characters from ``-_* ``."""
    stripped = line.strip()
    return len(stripped) >= 3 and only_chars(stripped, "-_* ")


def _is_bullet_line(line):
    """Tell whether ``line`` opens an unordered item: ``*``, ``+`` or ``-``, then a space."""
    return line.startswith(("* ", "+ ", "- "))


def _ends_paragraph(line):
    """Tell whether ``line`` may not be appended to a running paragraph."""
    if not line.strip():
        return True
    if line.startswith(("#", ">", ";", "* ", "+ ", "- ", "~~~", "```")):
        return True
    return re.match(r"^\d+\.", line) is not None


def _strip_indent(line, width):
    """Remove at most ``width`` leading spaces from ``line``, never other text."""
    return line[min(width, leading(line, " ")):]


def tokenise(source):
    """Split ``source`` into a list of block-level elements.

    One leading space is removed from every line, then the lines are grouped
    into blocks: void blocks (blank lines and lines starting with ``;``),
    rules, unordered and ordered lists, ``#`` headings, blockquotes, fenced
    code blocks, underlined headings and paragraphs.

    The returned list starts with a void ``Element`` because the initial
    placeholder block is emitted before the first real block. Every branch
    consumes at least one line, so the loop always terminates, and every block
    is emitted before the next one starts, so adjacent blocks of the same type
    are all kept.
    """
    tokens = []

    current_block = Element()

    # Remove a single leading space from each line.
    lines = [line[1:] if line.startswith(" ") else line for line in source.split("\n")]

    i = 0
    while i < len(lines):
        line = lines[i]
        if not line.strip() or line.startswith(";"):
            # Void block
            tokens.append(current_block)
            current_block = Element()

            i += 1
        elif _is_rule_line(line):
            # Horizontal rule
            tokens.append(current_block)
            current_block = Rule()

            i += 1
        elif _is_bullet_line(line):
            tokens.append(current_block)

            content = []

            while i < len(lines) and _is_bullet_line(lines[i]):
                inner_content = lines[i][2:].strip() + "\n"  # discard marker and space
                i += 1
                # Following non-blank lines belong to the item until the next
                # marker. Testing with startswith keeps one-character lines
                # such as "*" from raising IndexError.
                while i < len(lines) and lines[i].strip() and not _is_bullet_line(lines[i]):
                    inner_content += lines[i] + "\n"
                    i += 1

                content.append(ListItem(inner_content))

            current_block = UnorderedList(content)
        elif re.match(r"^\d+\.", line):
            tokens.append(current_block)

            content = []

            while i < len(lines) and re.match(r"^ ?\d+\.", lines[i]):
                marker, _, first_line = lines[i].partition(".")
                inner_content = first_line + "\n"  # discard number and period
                # Width of the marker itself, measured on the marker line
                # before moving on, so the last line of the input is safe.
                marker_length = len(marker) + 1
                i += 1
                while i < len(lines) and lines[i].strip() and not re.match(r"^ ?\d+\.", lines[i]):
                    inner_content += _strip_indent(lines[i], marker_length) + "\n"
                    i += 1

                content.append(ListItem(inner_content))

            current_block = OrderedList(content)
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # Heading: one or more "#" followed by at least one space.
            tokens.append(current_block)

            content = line.lstrip("#").strip()
            current_block = Heading(content, leading(line, "#"))

            i += 1
        elif line.startswith(">"):
            tokens.append(current_block)

            content = ""

            # Quoted lines, plus unquoted continuation lines that are not
            # blank, do not start with "#" and are not a horizontal rule.
            while i < len(lines) and (
                lines[i].startswith(">")
                or (
                    lines[i].strip()
                    and not lines[i].startswith("#")
                    and not _is_rule_line(lines[i])
                )
            ):
                content += lines[i].lstrip(">") + "\n"
                i += 1

            current_block = Blockquote(content)
        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            tokens.append(current_block)

            language = line.lstrip("`~").strip()

            content = ""
            i += 1  # skip the opening fence
            while i < len(lines) and lines[i].strip() not in ("```", "~~~"):
                content += lines[i] + "\n"
                i += 1

            if i < len(lines):
                i += 1  # prevent a new block from beginning with the closing fence

            current_block = CodeBlock(content, language=language)
        elif (
            i < len(lines) - 1
            and lines[i + 1].strip()
            and (only_chars(lines[i + 1].strip(), "=") or only_chars(lines[i + 1].strip(), "-"))
        ):
            # Heading underlined with "=" (level 1) or "-" (level 2).
            tokens.append(current_block)

            content = line.strip()
            underline = lines[i + 1].strip()
            current_block = Heading(content, 1 if underline.startswith("=") else 2)

            i += 2
        else:
            # Create a paragraph, if there is no other specifier
            tokens.append(current_block)

            # The first line is taken unconditionally: it reached this branch
            # because no other block accepted it (for example "#tag" or a
            # four-character fence), and skipping it would stall the loop.
            content = line.strip() + "\n"
            i += 1

            while i < len(lines) and not _ends_paragraph(lines[i]):
                content += lines[i].strip() + "\n"
                i += 1

            current_block = Paragraph(content)

    tokens.append(current_block)

    return tokens
465
466
467
def _node_text(node):
    """Return the plain text of a token: a string as is, an element's content flattened."""
    if isinstance(node, str):
        return node
    content = getattr(node, "content", None)
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        return "".join(_node_text(child) for child in content)
    return ""


def make_html(ast):
    """Render a list of tokens into a BeautifulSoup document.

    Strings are appended as text. Objects with a ``content`` attribute become
    a tag named after their ``tag_name``, except those whose tag name is
    ``m-void``, which are skipped. List content is rendered recursively and
    string content becomes the tag's text.

    Returns the BeautifulSoup object holding the generated nodes.
    """
    soup = beautifulsoup.BeautifulSoup()
    for node in ast:
        # Use bs4 to generate HTML
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        name = node.tag_name
        tag = soup.new_tag(str(name))
        if name == "a":
            tag["href"] = node.destination
        elif name == "img":
            tag["src"] = node.destination
            if all(isinstance(part, str) for part in node.content):
                tag["alt"] = " ".join(node.content)
            else:
                # Alt text with inline markup holds element objects, which
                # str.join rejects with TypeError; use their text instead.
                tag["alt"] = "".join(_node_text(part) for part in node.content)
        elif name == "pre":
            tag["data-language"] = node.language
        if node.classes:
            tag["class"] = " ".join(node.classes)

        try:
            if name == "img":
                # Images never get children; their label lives in "alt".
                tag.string = ""
            elif isinstance(node.content, list):
                tag.append(make_html(node.content))
            elif node.content:
                tag.string = node.content
        except AttributeError as exc:
            # Best effort: report the problem and still emit the tag.
            print(exc, file=sys.stderr)
        soup.append(tag)
    return soup
497
498
499
def markdown2html(markdown):
    """Convert markup text to HTML: tokenise it, then render the tokens."""
    ast = tokenise(markdown)
    return make_html(ast)
501
502