By using this site, you agree to have cookies stored on your device, strictly for functional purposes, such as storing your session and preferences.

Dismiss

 markdown.py

View raw Download
text/x-script.python • 12.56 kiB
Python script, ASCII text executable
        
            
1
import re
2
import bs4 as beautifulsoup
3
import sys
4
5
6
def only_chars(string, chars):
    """Return True when every character of *string* also occurs in *chars*.

    An empty *string* trivially satisfies the condition.
    """
    allowed = set(chars)
    return all(character in allowed for character in set(string))
10
11
12
inlineRegex = r"""
13
(?P<em>[*_]{1,7}) (?P<textEm>(?:\\[*]|[^*])*) (?P=em) # emphasis
14
|
15
[`] (?P<textCode>(?:\\[`]|[^`])*) [`] # inline code
16
|
17
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\) # hyperlink or media
18
|
19
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2}) # strikethrough
20
|
21
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff) # diffs
22
"""
23
24
25
def leading(string, character):
    """Count how many characters at the start of *string* belong to *character*.

    *character* is handed to ``str.lstrip``, so it may name several
    characters; any of them counts.
    """
    remainder = string.lstrip(character)
    return len(string) - len(remainder)
27
28
29
def trailing(string, character):
    """Count how many characters at the end of *string* belong to *character*.

    *character* is handed to ``str.rstrip``, so it may name several
    characters; any of them counts.
    """
    remainder = string.rstrip(character)
    return len(string) - len(remainder)
31
32
33
class Element:
    """Base node of the syntax tree; on its own it represents a void block."""

    def __init__(self):
        # CSS class names attached to the node.
        self.classes = []
        # Payload of the node; subclasses replace it with text or child nodes.
        self.content = None

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        """Name of the tag this node maps to ("m-void" for the base class)."""
        return "m-void"
45
46
47
class Container(Element):
    """Element whose content is the inline-parsed form of a text string."""

    def __init__(self, content):
        super().__init__()
        # parse_line turns the raw text into a list of strings and inline nodes.
        self.content = parse_line(content)

    def __repr__(self):
        return f"Generic container element: {self.content!r}"
54
55
56
class Rule(Element):
    """Horizontal rule; carries no content of its own."""

    # Element.__init__ already provides everything this node needs.

    def __repr__(self):
        return "Rule"

    @property
    def tag_name(self):
        """Name of the tag this node maps to."""
        return "hr"
66
67
68
class HardBreak(Element):
    """Forced line break; carries no content of its own."""

    # Element.__init__ already provides everything this node needs.

    def __repr__(self):
        return "Hard break"

    @property
    def tag_name(self):
        """Name of the tag this node maps to."""
        return "br"
78
79
80
class Heading(Container):
    """Heading whose text is inline-parsed and whose depth is *level*."""

    def __init__(self, content, level):
        super().__init__(content)
        # Numeric depth; it becomes the digit in the tag name.
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Name of the tag this node maps to: "h" followed by the level."""
        return f"h{self.level}"
92
93
94
class Paragraph(Container):
    """Paragraph of text; its content is the inline-parsed form of the text."""

    def __init__(self, content):
        # Container.__init__ stores parse_line(content) as self.content.
        super().__init__(content)

    def __repr__(self):
        return f"Paragraph:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Name of the tag this node maps to."""
        return "p"
105
106
107
class CodeBlock(Element):
    """Block of code kept verbatim, tagged with a language name."""

    def __init__(self, content, language="text"):
        super().__init__()
        # Stored as given; no inline parsing is applied to code.
        self.content = content
        self.language = language

    def __repr__(self):
        return f"Code block ({self.language}):\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Name of the tag this node maps to."""
        return "pre"
119
120
121
class UnorderedList(Element):
    """Bulleted list; *content* is stored as passed in."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return f"Unordered list:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Name of the tag this node maps to."""
        return "ul"
132
133
134
class OrderedList(Element):
    """Numbered list; *content* is stored as passed in."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return f"Ordered list:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Name of the tag this node maps to."""
        return "ol"
145
146
147
class ListItem(Element):
    """Single list entry; its text is block-parsed, so it may nest blocks."""

    def __init__(self, content):
        super().__init__()
        # tokenise returns a list of block nodes built from the item text.
        self.content = tokenise(content)

    def __repr__(self):
        return f"List item:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Name of the tag this node maps to."""
        return "li"
158
159
160
class Blockquote(Paragraph):
    """Quoted section; its text is block-parsed, so it may nest blocks."""

    def __init__(self, content):
        # Initialise the base with empty text, then install the block-parsed
        # content in place of the inline-parsed one.
        super().__init__("")
        self.content = tokenise(content)

    def __repr__(self):
        return f"Blockquote:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Name of the tag this node maps to."""
        return "blockquote"
171
172
173
class Emphasis(Container):
    """Emphasised inline text; *value* is the strength of the emphasis."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

        # Each condition that holds contributes one CSS class, strongest first.
        class_conditions = (
            (value >= 4, "emphasis-3"),
            (value % 4 >= 2, "emphasis-2"),
            (value % 2 != 0, "emphasis-1"),
        )
        for applies, class_name in class_conditions:
            if applies:
                self.classes.append(class_name)

    def __repr__(self):
        return f"Emphasis ({self.value}): {self.content!r}"

    @property
    def tag_name(self):
        """Name of the tag this node maps to: "em" for value 1, else "strong"."""
        if self.value == 1:
            return "em"
        return "strong"
190
191
192
class Code(Element):
    """Inline code span; the raw text is kept as a one-element list."""

    def __init__(self, content):
        super().__init__()
        # Wrapped in a list without inline parsing.
        self.content = [content]

    def __repr__(self):
        return "Inline code: " + str(self.content)

    @property
    def tag_name(self):
        """Name of the tag this node maps to."""
        return "code"
203
204
205
class Strikethrough(Container):
    """Struck-through inline text; the text itself is inline-parsed."""

    # Container.__init__(content) is inherited unchanged.

    def __repr__(self):
        return "Strikethrough: " + repr(self.content)

    @property
    def tag_name(self):
        """Name of the tag this node maps to."""
        return "s"
215
216
217
class Diff(Container):
    """Inline insertion or deletion; *value* is the delimiter that was used."""

    def __init__(self, content, value):
        super().__init__(content)
        # "++" marks an insertion; any other value is treated as a deletion.
        self.value = value

    def __repr__(self):
        return "Diff ({}): {}".format(self.value, self.content)

    @property
    def tag_name(self):
        """Name of the tag this node maps to: "ins" for "++", else "del"."""
        if self.value == "++":
            return "ins"
        return "del"
228
229
230
class Link(Element):
    """Hyperlink with display text and a destination."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        # The text is stored as given; it is not inline-parsed.
        self.content = content
        self.destination = destination
        self.image = image

    def __repr__(self):
        kind = "Image" if self.image else "Link"
        return f"{kind}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        """Name of the tag this node maps to."""
        return "a"
243
244
245
class Image(Link):
    """Link flagged as an image."""

    def __init__(self, text, destination):
        super().__init__(text, destination, image=True)

    @property
    def tag_name(self):
        """Name of the tag this node maps to."""
        return "img"
252
253
254
def parse_line(source):
    """Split *source* into plain strings and inline nodes.

    The text is scanned with ``inlineRegex``. The stretches between matches
    are kept as plain strings (possibly empty), and every match becomes an
    Emphasis, Code, Strikethrough, Diff, Link or Image node. When *source*
    ends in exactly one backslash, that backslash is removed and a HardBreak
    node is appended to the result.

    Returns:
        A list mixing ``str`` and node objects, in document order.
    """
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    pattern = re.compile(inlineRegex, re.MULTILINE | re.DOTALL | re.VERBOSE)

    tokens = []
    cursor = 0
    for match in pattern.finditer(source):
        # Plain text between the previous match and this one.
        tokens.append(source[cursor:match.start()])
        cursor = match.end()

        # Exactly one alternative of the pattern matched. Groups that may
        # legitimately be empty ("``", "[](url)") are compared against None
        # so the matched text is not silently discarded.
        if match.group("em"):
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("textCode") is not None:
            tokens.append(Code(match.group("textCode")))
        elif match.group("strike"):
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff"):
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlText") is not None:
            text = match.group("urlText")
            destination = match.group("urlDestination")
            if match.group("imageFlag"):
                tokens.append(Image(text, destination))
            else:
                tokens.append(Link(text, destination))

    # Whatever follows the last match (the whole text if nothing matched).
    tokens.append(source[cursor:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens
293
294
295
_ORDERED_MARKER = re.compile(r"^\d+\.")


def _is_bullet(line):
    """Return True if *line* opens an unordered list item ("* ", "+ ", "- ")."""
    return line.startswith(("* ", "+ ", "- "))


def _is_numbered(line):
    """Return True if *line* opens an ordered list item (digits, then ".")."""
    return _ORDERED_MARKER.match(line) is not None


def _is_rule(line):
    """Return True if *line* is a horizontal rule: 3+ chars drawn from "-_* "."""
    stripped = line.strip()
    return len(stripped) >= 3 and only_chars(stripped, "-_* ")


def _continues_paragraph(line):
    """Return True if *line* may be appended to a running paragraph."""
    return (
        bool(line.strip())
        and not line.startswith(("#", ">", "~~~", "```"))
        and not _is_bullet(line)
        and not _is_numbered(line)
    )


def _join_lines(parts):
    """Concatenate *parts*, terminating each one with a newline."""
    return "".join(part + "\n" for part in parts)


def tokenise(source):
    """Split Markdown *source* into a list of block-level nodes.

    One leading space is removed from every line, then the lines are consumed
    top to bottom. Each step recognises one block (void, rule, list, heading,
    blockquote, fenced code, setext heading or paragraph), stores the block
    that was being built and starts the new one.

    Returns:
        A list of Element instances in document order. The list always
        starts with a void Element, the initial "current" block.
    """
    tokens = []
    current_block = Element()

    # Remove a single leading space from each line.
    lines = [line[1:] if line.startswith(" ") else line for line in source.split("\n")]

    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Stripped text of the following line, used to detect setext headings.
        underline = lines[i + 1].strip() if i + 1 < len(lines) else ""

        # Every branch below starts a new block, so the finished one is
        # always kept; skipping this would drop it from the output.
        tokens.append(current_block)

        if not stripped:
            # Void block
            current_block = Element()
            i += 1
        elif _is_rule(line):
            # Horizontal rule
            current_block = Rule()
            i += 1
        elif _is_bullet(line):
            items = []
            while i < len(lines) and _is_bullet(lines[i]):
                parts = [lines[i][2:].strip()]  # discard marker and space
                i += 1
                # Continuation lines run until a blank line or the next marker.
                while i < len(lines) and lines[i].strip() and not _is_bullet(lines[i]):
                    continuation = lines[i]
                    # Drop one column of indentation, but never a real character.
                    if continuation[:1].isspace():
                        continuation = continuation[1:]
                    parts.append(continuation)
                    i += 1
                items.append(ListItem(_join_lines(parts)))
            current_block = UnorderedList(items)
        elif _is_numbered(line):
            items = []
            # The tests look at lines[i], the line under the cursor, not at
            # the line that opened the list.
            while i < len(lines) and _is_numbered(lines[i]):
                parts = [lines[i].split(".", 1)[1]]  # discard number and period
                i += 1
                while i < len(lines) and lines[i].strip() and not _is_numbered(lines[i]):
                    parts.append(lines[i])
                    i += 1
                items.append(ListItem(_join_lines(parts)))
            current_block = OrderedList(items)
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # ATX heading: "#" markers followed by at least one space.
            current_block = Heading(line.lstrip("#").strip(), leading(line, "#"))
            i += 1
        elif line.startswith(">"):
            quoted = []
            # Take ">" lines, plus unmarked follow-on lines up to a blank
            # line, a "#" line or a horizontal rule.
            while i < len(lines) and (
                lines[i].startswith(">")
                or (
                    lines[i].strip()
                    and not lines[i].startswith("#")
                    and not _is_rule(lines[i])
                )
            ):
                quoted.append(lines[i].lstrip(">"))
                i += 1
            current_block = Blockquote(_join_lines(quoted))
        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            language = line.lstrip("`~").strip()
            i += 1  # skip the opening fence

            body = []
            while i < len(lines) and lines[i].strip() not in ("```", "~~~"):
                body.append(lines[i])
                i += 1

            if i < len(lines):
                i += 1  # prevent a new block from beginning with the closing fence

            current_block = CodeBlock(_join_lines(body), language=language)
        elif underline and (only_chars(underline, "=") or only_chars(underline, "-")):
            # Setext heading: text line underlined with "=" (level 1) or "-" (level 2).
            current_block = Heading(stripped, 1 if underline.startswith("=") else 2)
            i += 2
        else:
            # Paragraph, if there is no other specifier. The current line is
            # always consumed, so the loop advances even for lines such as
            # "#tag" or "~~~~" that no other branch accepts.
            parts = [stripped]
            i += 1
            while i < len(lines) and _continues_paragraph(lines[i]):
                parts.append(lines[i].strip())
                i += 1
            current_block = Paragraph(_join_lines(parts))

    tokens.append(current_block)

    return tokens
417
418
419
def make_html(ast):
    """Render a list of strings and nodes as a BeautifulSoup document.

    Strings are appended as text. A node is rendered as a tag named after its
    ``tag_name``; nodes without a ``content`` attribute and nodes whose tag
    name is "m-void" are skipped. List content is rendered recursively,
    string content becomes the text of the tag.

    Returns:
        The BeautifulSoup object holding the generated markup.
    """
    soup = beautifulsoup.BeautifulSoup()
    for node in ast:
        # Use bs4 to generate HTML
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        tag = soup.new_tag(str(node.tag_name))
        is_image = node.tag_name == "img"

        if node.tag_name == "a":
            tag["href"] = node.destination
        if is_image:
            tag["src"] = node.destination
            # <img> is a void element and cannot hold children, so the
            # description goes into the alt attribute.
            tag["alt"] = node.content or ""
        if node.tag_name == "pre":
            tag["data-language"] = node.language
        if node.classes:
            tag["class"] = " ".join(node.classes)

        try:
            if isinstance(node.content, list):
                tag.append(make_html(node.content))
            elif node.content and not is_image:
                tag.string = node.content
        except AttributeError as exc:
            # Best effort: report the problem and still emit the tag.
            print(exc, file=sys.stderr)

        soup.append(tag)
    return soup
446
447
448
if __name__ == '__main__':
    # The Markdown file to convert may be given as the first command-line
    # argument; without one, the original changelog path is used.
    default_path = "/home/vlad/roundabout/doc/changelog/0.1.0 (2024-03-31).md"
    path = sys.argv[1] if len(sys.argv) > 1 else default_path

    with open(path) as file:
        # Generate an AST from a markdown file
        ast = tokenise(file.read())

    # Now convert the AST to HTML
    print(make_html(ast).prettify(formatter=beautifulsoup.formatter.HTMLFormatter(indent=4)))
459
460