By using this site, you agree to have cookies stored on your device, strictly for functional purposes, such as storing your session and preferences.

Dismiss

 markdown.py

View raw Download
text/x-script.python • 12.91 kiB
Python script, ASCII text executable
        
            
1
import re
2
import bs4 as beautifulsoup
3
import sys
4
5
6
def only_chars(string, chars):
    """Return True when every character of ``string`` also occurs in ``chars``.

    An empty ``string`` yields True, because the empty set is a subset of
    every set.
    """
    allowed = set(chars)
    return set(string) <= allowed
10
11
12
# Grammar for inline markup. parse_line() compiles it with
# re.MULTILINE | re.DOTALL | re.VERBOSE, so unescaped whitespace is ignored
# and "#" starts a comment inside the pattern.
#
# The underscore rule uses a look-behind instead of consuming the preceding
# whitespace, so the whitespace stays outside the match and is not lost, and
# its text excludes "_" (not "*") so two separate spans on one line are not
# merged into one.
inlineRegex = r"""
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\) # hyperlink or media
|
(?P<em>\*{1,7}) (?P<textEm>(?:\\\*|[^*])*) (?P=em) # emphasis with * not requiring space on either side
|
(?<!\S)(?P<em2>_{1,7}) (?P<textEm2>(?:\\.|[^_])*) (?P=em2)(?=\s|$) # emphasis with _ delimited by whitespace or a line boundary
|
[`] (?P<textCode>(?:\\[`]|[^`])*) [`] # inline code
|
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2}) # strikethrough
|
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff) # diffs
"""
25
26
27
def leading(string, character):
    """Return how many characters ``str.lstrip(character)`` removes from the start of ``string``."""
    remainder = string.lstrip(character)
    return len(string) - len(remainder)
29
30
31
def trailing(string, character):
    """Return how many characters ``str.rstrip(character)`` removes from the end of ``string``."""
    remainder = string.rstrip(character)
    return len(string) - len(remainder)
33
34
35
class Element:
    """Base node of the parsed document; a bare instance represents a void block."""

    def __init__(self):
        # No payload by default; subclasses assign their own content.
        self.content = None
        # CSS class names attached to the rendered tag.
        self.classes = []

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        """Name of the tag this node renders as ("m-void" for the base class)."""
        return "m-void"
47
48
49
class Container(Element):
    """Element whose content is the inline-parsed token list of a text string."""

    def __init__(self, content):
        super().__init__()
        inline_tokens = parse_line(content)
        self.content = inline_tokens

    def __repr__(self):
        return f"Generic container element: {self.content!r}"
56
57
58
class Rule(Element):
    """Horizontal rule; carries no content of its own."""

    # Construction is inherited unchanged from Element (no arguments).

    def __repr__(self):
        return "Rule"

    @property
    def tag_name(self):
        """Rendered as an ``hr`` tag."""
        return "hr"
68
69
70
class HardBreak(Element):
    """Forced line break; carries no content of its own."""

    # Construction is inherited unchanged from Element (no arguments).

    def __repr__(self):
        return "Hard break"

    @property
    def tag_name(self):
        """Rendered as a ``br`` tag."""
        return "br"
80
81
82
class Heading(Container):
    """Heading whose text is inline-parsed and whose ``level`` selects the ``h<level>`` tag."""

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name built from the level, e.g. ``h2`` for level 2."""
        return f"h{self.level}"
94
95
96
class Paragraph(Container):
    """Paragraph of text whose content is the inline-parsed token list.

    Args:
        content: Raw paragraph text, passed to ``parse_line`` by the base
            class constructor.
    """

    def __init__(self, content):
        # Container.__init__ already stores parse_line(content). The previous
        # version parsed an empty string first and then overwrote the result.
        super().__init__(content)

    def __repr__(self):
        return "Paragraph:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        """Rendered as a ``p`` tag."""
        return "p"
107
108
109
class CodeBlock(Element):
    """Fenced code block; ``content`` is stored verbatim, without inline parsing."""

    def __init__(self, content, language="text"):
        super().__init__()
        self.language = language
        self.content = content

    def __repr__(self):
        header = f"Code block ({self.language}):\n\t"
        return header + repr(self.content)

    @property
    def tag_name(self):
        """Rendered as a ``pre`` tag."""
        return "pre"
121
122
123
class UnorderedList(Element):
    """Bulleted list; ``content`` is stored exactly as given by the caller."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return f"Unordered list:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Rendered as a ``ul`` tag."""
        return "ul"
134
135
136
class OrderedList(Element):
    """Numbered list; ``content`` is stored exactly as given by the caller."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return f"Ordered list:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Rendered as an ``ol`` tag."""
        return "ol"
147
148
149
class ListItem(Element):
    """Single list entry; its text is block-parsed again with ``tokenise``."""

    def __init__(self, content):
        super().__init__()
        nested_blocks = tokenise(content)
        self.content = nested_blocks

    def __repr__(self):
        return f"List item:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Rendered as an ``li`` tag."""
        return "li"
160
161
162
class Blockquote(Paragraph):
    """Quoted section; its text is block-parsed again with ``tokenise``."""

    def __init__(self, content):
        # Initialise the base classes with empty text, then replace the
        # content with the block-level parse of the quoted text.
        super().__init__("")
        nested_blocks = tokenise(content)
        self.content = nested_blocks

    def __repr__(self):
        return f"Blockquote:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Rendered as a ``blockquote`` tag."""
        return "blockquote"
173
174
175
class Emphasis(Container):
    """Emphasised inline text; ``value`` is the emphasis strength passed by the caller."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

        # Each check contributes one CSS class; the order of the checks is
        # the order in which the classes are appended.
        class_checks = (
            (value >= 4, "emphasis-3"),
            (value % 4 >= 2, "emphasis-2"),
            (value % 2, "emphasis-1"),
        )
        for applies, css_class in class_checks:
            if applies:
                self.classes.append(css_class)

    def __repr__(self):
        return f"Emphasis ({self.value}): {self.content!r}"

    @property
    def tag_name(self):
        """``em`` when the value is exactly 1, ``strong`` otherwise."""
        if self.value == 1:
            return "em"
        return "strong"
192
193
194
class Code(Element):
    """Inline code span; the raw text is kept as the only item of a one-element list."""

    def __init__(self, content):
        super().__init__()
        # Stored as a list holding the unparsed string.
        self.content = [content]

    def __repr__(self):
        return "Inline code: " + str(self.content)

    @property
    def tag_name(self):
        """Rendered as a ``code`` tag."""
        return "code"
205
206
207
class Strikethrough(Container):
    """Struck-through inline text."""

    # Construction is inherited unchanged from Container (one ``content`` argument).

    def __repr__(self):
        return "Strikethrough: " + repr(self.content)

    @property
    def tag_name(self):
        """Rendered as an ``s`` tag."""
        return "s"
217
218
219
class Diff(Container):
    """Inline insertion or deletion; ``value`` is the delimiter that introduced it."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return f"Diff ({self.value}): {self.content}"

    @property
    def tag_name(self):
        """``ins`` when the delimiter is "++", ``del`` for anything else."""
        if self.value == "++":
            return "ins"
        return "del"
230
231
232
class Link(Element):
    """Hyperlink: inline-parsed link text plus a destination string."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.image = image
        self.destination = destination
        self.content = parse_line(content)

    def __repr__(self):
        kind = "Image" if self.image else "Link"
        return f"{kind}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        """Rendered as an ``a`` tag."""
        return "a"
245
246
247
class Image(Link):
    """Image reference; a Link constructed with its ``image`` flag set to True."""

    def __init__(self, text, destination):
        super().__init__(text, destination, image=True)

    @property
    def tag_name(self):
        """Rendered as an ``img`` tag."""
        return "img"
254
255
256
def parse_line(source):
    """Split a run of text into inline tokens.

    The text is scanned with ``inlineRegex``. Plain text between matches is
    kept as ``str`` tokens (these may be empty strings), and each match is
    turned into the corresponding inline element.

    Args:
        source: Text to parse. If it ends with exactly one backslash, that
            backslash is removed and a ``HardBreak`` is appended to the
            result.

    Returns:
        A list mixing ``str`` fragments and inline element objects, in
        document order.
    """
    # A single trailing backslash requests a hard line break.
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    tokens = []
    pattern = re.compile(inlineRegex, re.MULTILINE | re.DOTALL | re.VERBOSE)

    cursor = 0
    for match in pattern.finditer(source):
        # Literal text between the previous match and this one.
        tokens.append(source[cursor:match.start()])
        cursor = match.end()

        # Exactly one alternative of the pattern matched; groups belonging
        # to the other alternatives are None.
        if match.group("em"):
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("em2"):
            tokens.append(Emphasis(match.group("textEm2"), len(match.group("em2"))))
        elif match.group("textCode") is not None:
            # "is not None" rather than truthiness: an empty code span is
            # still a match and must not vanish from the output.
            tokens.append(Code(match.group("textCode")))
        elif match.group("strike"):
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff"):
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlText") is not None:
            # Link text may legitimately be empty, e.g. "![](picture.png)".
            if match.group("imageFlag"):
                tokens.append(Image(match.group("urlText"), match.group("urlDestination")))
            else:
                tokens.append(Link(match.group("urlText"), match.group("urlDestination")))

    # Whatever follows the last match.
    tokens.append(source[cursor:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens
297
298
299
# Start of an ordered-list item: digits followed by a period.
_ORDERED_MARKER = re.compile(r"^\d+\.")


def _is_rule(line):
    """Return True if ``line`` is a horizontal rule (3+ characters drawn from "-_* ")."""
    stripped = line.strip()
    return len(stripped) >= 3 and only_chars(stripped, "-_* ")


def _is_bullet(line):
    """Return True if ``line`` starts an unordered-list item ("*", "+" or "-" then a space)."""
    return line.startswith(("* ", "+ ", "- "))


def _is_ordered(line):
    """Return True if ``line`` starts an ordered-list item (digits then a period)."""
    return _ORDERED_MARKER.match(line) is not None


def _interrupts_paragraph(line):
    """Return True if ``line`` ends a paragraph that is being collected."""
    return (
        not line.strip()
        or line.startswith(("#", ">", ";", "* ", "+ ", "- ", "~~~", "```"))
        or _is_ordered(line)
    )


def tokenise(source):
    """Parse Markdown text into a flat list of block-level elements.

    One leading space (at most) is removed from every line, then the lines
    are consumed top to bottom. Each branch below recognises one block type,
    pushes the block built so far onto the result and starts a new one.

    Args:
        source: Markdown text; lines are separated by "\\n".

    Returns:
        A list of block elements in document order. The first entry is always
        the initial void ``Element``.
    """
    tokens = []
    current_block = Element()

    # Drop a single leading space from each line (deeper indentation is kept).
    lines = [line[1:] if line.startswith(" ") else line for line in source.split("\n")]

    i = 0
    while i < len(lines):
        line = lines[i]

        if not line.strip() or line.startswith(";"):
            # Blank line or ";" line: void block.
            tokens.append(current_block)
            current_block = Element()
            i += 1

        elif _is_rule(line):
            # Horizontal rule.
            tokens.append(current_block)
            current_block = Rule()
            i += 1

        elif _is_bullet(line):
            # The previous block is always flushed: nothing merges blocks, so
            # skipping the append would silently discard it.
            tokens.append(current_block)

            items = []
            while i < len(lines) and _is_bullet(lines[i]):
                inner_content = lines[i][2:].strip() + "\n"  # discard marker and space
                i += 1
                # Continuation lines: non-blank lines that do not start a new item.
                while i < len(lines) and lines[i].strip() and not _is_bullet(lines[i]):
                    inner_content += lines[i][1:] + "\n"
                    i += 1
                items.append(ListItem(inner_content))

            current_block = UnorderedList(items)

        elif _is_ordered(line):
            tokens.append(current_block)

            items = []
            # Test lines[i], not the line that opened the list; the stale
            # value made every later line containing "." a list item.
            while i < len(lines) and _is_ordered(lines[i]):
                inner_content = lines[i].split(".", 1)[1] + "\n"  # discard number and period
                i += 1
                while i < len(lines) and lines[i].strip() and not _is_ordered(lines[i]):
                    inner_content += lines[i] + "\n"
                    i += 1
                items.append(ListItem(inner_content))

            current_block = OrderedList(items)

        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # ATX heading: one or more "#" followed by at least one space.
            tokens.append(current_block)
            current_block = Heading(line.lstrip("#").strip(), leading(line, "#"))
            i += 1

        elif line.startswith(">"):
            tokens.append(current_block)

            content = ""
            # Quoted lines, plus unquoted continuation lines that are not
            # blank, do not start with "#" and are not a horizontal rule.
            while i < len(lines) and (
                lines[i].startswith(">")
                or (
                    lines[i].strip()
                    and not lines[i].startswith("#")
                    and not _is_rule(lines[i])
                )
            ):
                content += lines[i].lstrip(">") + "\n"
                i += 1

            current_block = Blockquote(content)

        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            # Always flush, so two fences in a row do not overwrite each other.
            tokens.append(current_block)

            language = line.lstrip("`~").strip()

            content = ""
            i += 1  # skip the opening fence
            while i < len(lines) and lines[i].strip() not in ("```", "~~~"):
                content += lines[i] + "\n"
                i += 1

            if i < len(lines):
                i += 1  # prevent a new block from beginning with the closing fence

            current_block = CodeBlock(content, language=language)

        elif (
            i < len(lines) - 1
            and lines[i + 1].strip()
            and (only_chars(lines[i + 1].strip(), "=") or only_chars(lines[i + 1].strip(), "-"))
        ):
            # Setext heading: a text line underlined with "=" (level 1) or "-" (level 2).
            tokens.append(current_block)
            underline = lines[i + 1].strip()
            current_block = Heading(line.strip(), 1 if underline.startswith("=") else 2)
            i += 2

        else:
            # Paragraph: the fallback when no other block type applies.
            tokens.append(current_block)

            # The current line is consumed unconditionally. It can start with
            # a paragraph-interrupting prefix (e.g. "#tag" or "~~~~") without
            # qualifying for any branch above; not consuming it left `i`
            # unchanged and the loop never terminated.
            content = lines[i].strip() + "\n"
            i += 1
            while i < len(lines) and not _interrupts_paragraph(lines[i]):
                content += lines[i].strip() + "\n"
                i += 1

            current_block = Paragraph(content)

    tokens.append(current_block)

    return tokens
422
423
424
def make_html(ast):
    """Render a list of tokens as a BeautifulSoup document.

    Args:
        ast: Iterable of ``str`` fragments and element objects. Elements
            whose ``tag_name`` is "m-void" are skipped.

    Returns:
        A ``BeautifulSoup`` object holding the rendered nodes in order.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = beautifulsoup.BeautifulSoup("", "html.parser")

    for node in ast:
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        tag = soup.new_tag(str(node.tag_name))
        is_image = node.tag_name == "img"

        if node.tag_name == "a":
            tag["href"] = node.destination
        if is_image:
            tag["src"] = node.destination
            # <img> is a void element and cannot have children, so the
            # description is emitted as the alt attribute instead.
            if isinstance(node.content, list):
                tag["alt"] = make_html(node.content).get_text()
            elif node.content:
                tag["alt"] = str(node.content)
        if node.tag_name == "pre":
            tag["data-language"] = node.language
        if node.classes:
            tag["class"] = " ".join(node.classes)

        if not is_image:
            try:
                if isinstance(node.content, list):
                    # Nested tokens are rendered recursively.
                    tag.append(make_html(node.content))
                elif node.content:
                    tag.string = node.content
            except AttributeError as exc:
                # Best effort: report the problem and still emit the tag.
                print(exc, file=sys.stderr)

        soup.append(tag)

    return soup
451
452
453
if __name__ == '__main__':
    # Markdown file to render: the first command-line argument when given,
    # otherwise the path that was previously hard-coded.
    path = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "/home/vlad/roundabout/doc/changelog/0.1.0 (2024-03-31).md"
    )

    # Read as UTF-8 explicitly rather than relying on the locale default.
    with open(path, encoding="utf-8") as file:
        # Generate an AST from the markdown file
        ast = tokenise(file.read())

    # Now convert the AST to HTML
    print(make_html(ast).prettify(formatter=beautifulsoup.formatter.HTMLFormatter(indent=4)))
462
463