By using this site, you agree to have cookies stored on your device, strictly for functional purposes, such as storing your session and preferences.

Dismiss

 markdown.py

View raw Download
text/x-script.python • 11.66 kiB
Python script, ASCII text executable
        
            
1
import re
2
import bs4 as beautifulsoup
3
import sys
4
5
6
def only_chars(string, chars):
    """Return True when every character of *string* also occurs in *chars*.

    An empty *string* satisfies the condition trivially.
    """
    allowed = set(chars)
    return all(ch in allowed for ch in string)
10
11
12
inlineRegex = r"""
13
(?P<em>[*_]{1,7}) (?P<textEm>(?:\\[*]|[^*])*) (?P=em) # emphasis
14
|
15
[`] (?P<textCode>(?:\\[`]|[^`])*) [`] # inline code
16
|
17
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\) # hyperlink or media
18
|
19
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2}) # strikethrough
20
|
21
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff) # diffs
22
"""
23
24
25
def leading(string, character):
    """Count the characters at the start of *string* that str.lstrip removes for *character*."""
    remainder = string.lstrip(character)
    return len(string) - len(remainder)
27
28
29
def trailing(string, character):
    """Count the characters at the end of *string* that str.rstrip removes for *character*."""
    remainder = string.rstrip(character)
    return len(string) - len(remainder)
31
32
33
class Element:
    """Base node of the syntax tree; on its own it represents a void block."""

    def __init__(self):
        # CSS class names; make_html joins them into the "class" attribute.
        self.classes = []
        # Payload of the node; subclasses replace it with text or child tokens.
        self.content = None

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        """Tag name for this node; make_html skips nodes reporting "m-void"."""
        return "m-void"
45
46
47
class Container(Element):
    """Element whose content is the inline token list produced by parse_line."""

    def __init__(self, content):
        super().__init__()
        self.content = parse_line(content)

    def __repr__(self):
        return f"Generic container element: {self.content!r}"
54
55
56
class Rule(Element):
    """Horizontal rule; carries no content of its own."""

    # Element.__init__ already sets up everything this node needs.

    def __repr__(self):
        return "Rule"

    @property
    def tag_name(self):
        """HTML tag used when rendering this node."""
        return "hr"
66
67
68
class HardBreak(Element):
    """Forced line break; carries no content of its own."""

    # Element.__init__ already sets up everything this node needs.

    def __repr__(self):
        return "Hard break"

    @property
    def tag_name(self):
        """HTML tag used when rendering this node."""
        return "br"
78
79
80
class Heading(Container):
    """Heading whose text is parsed for inline markup.

    *level* is the heading depth; it is appended to "h" to form the tag name.
    """

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """HTML tag used when rendering this node, e.g. "h2" for level 2."""
        return f"h{self.level}"
92
93
94
class Paragraph(Container):
    """Paragraph of text whose content is parsed for inline markup."""

    def __init__(self, content):
        # Container.__init__ stores parse_line(content) in self.content.
        super().__init__(content)

    def __repr__(self):
        return f"Paragraph:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """HTML tag used when rendering this node."""
        return "p"
105
106
107
class CodeBlock(Element):
    """Fenced code block; *content* is stored verbatim, without inline parsing.

    *language* is emitted by make_html as the "data-language" attribute.
    """

    def __init__(self, content, language="text"):
        super().__init__()
        self.language = language
        self.content = content

    def __repr__(self):
        return f"Code block ({self.language}):\n\t{self.content!r}"

    @property
    def tag_name(self):
        """HTML tag used when rendering this node."""
        return "pre"
119
120
121
class UnorderedList(Element):
    """Bulleted list; *content* is stored as given (tokenise passes ListItem objects)."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return f"Unordered list:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """HTML tag used when rendering this node."""
        return "ul"
132
133
134
class OrderedList(Element):
    """Numbered list; *content* is stored exactly as given."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return f"Ordered list:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """HTML tag used when rendering this node."""
        return "ol"
145
146
147
class ListItem(Paragraph):
    """List entry whose text is tokenised as nested block-level markup."""

    def __init__(self, content):
        # Initialise the base with empty text; the real content is the block
        # token list from tokenise, not an inline token list.
        super().__init__("")
        self.content = tokenise(content)

    def __repr__(self):
        return f"List item:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """HTML tag used when rendering this node."""
        return "li"
158
159
160
class Blockquote(Paragraph):
    """Quoted section whose text is tokenised as nested block-level markup."""

    def __init__(self, content):
        # Initialise the base with empty text; the real content is the block
        # token list from tokenise, not an inline token list.
        super().__init__("")
        self.content = tokenise(content)

    def __repr__(self):
        return f"Blockquote:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """HTML tag used when rendering this node."""
        return "blockquote"
171
172
173
class Emphasis(Container):
    """Inline emphasis of a given strength.

    *value* is the emphasis strength (parse_line passes the length of the
    marker run). It selects the CSS classes and whether the node renders as
    "em" or "strong".
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value
        # Each condition contributes one class, strongest first.
        flags = (
            (value >= 4, "emphasis-3"),
            (value % 4 >= 2, "emphasis-2"),
            (value % 2, "emphasis-1"),
        )
        self.classes.extend(name for applies, name in flags if applies)

    def __repr__(self):
        return f"Emphasis ({self.value}): {self.content!r}"

    @property
    def tag_name(self):
        """HTML tag used when rendering: "em" for strength 1, otherwise "strong"."""
        if self.value == 1:
            return "em"
        return "strong"
190
191
192
class Code(Element):
    """Inline code span; the raw text is kept as the only item of a list."""

    def __init__(self, content):
        super().__init__()
        # Wrapped in a list so make_html treats it as a child token list.
        self.content = [content]

    def __repr__(self):
        return f"Inline code: {self.content}"

    @property
    def tag_name(self):
        """HTML tag used when rendering this node."""
        return "code"
203
204
205
class Strikethrough(Container):
    """Struck-through inline text; content is parsed for nested inline markup."""

    # Container.__init__(content) already does all the required set-up.

    def __repr__(self):
        return f"Strikethrough: {repr(self.content)}"

    @property
    def tag_name(self):
        """HTML tag used when rendering this node."""
        return "s"
215
216
217
class Diff(Container):
    """Inline insertion or deletion.

    *value* is the marker that delimited the text; "++" renders as an
    insertion and anything else as a deletion.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return f"Diff ({self.value}): {self.content}"

    @property
    def tag_name(self):
        """HTML tag used when rendering: "ins" for "++", otherwise "del"."""
        if self.value == "++":
            return "ins"
        return "del"
228
229
230
class Link(Element):
    """Hyperlink with display text *content* pointing at *destination*.

    *image* only affects the label used by __repr__.
    """

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.image = image
        self.destination = destination
        self.content = content

    def __repr__(self):
        label = "Image" if self.image else "Link"
        return f"{label}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        """HTML tag used when rendering this node."""
        return "a"
243
244
245
class Image(Link):
    """Embedded image; a Link created with its image flag set."""

    def __init__(self, text, destination):
        super().__init__(text, destination, image=True)

    @property
    def tag_name(self):
        """HTML tag used when rendering this node."""
        return "img"
252
253
254
def parse_line(source):
    """Split *source* into a list of inline tokens.

    The result interleaves plain-text strings with inline element objects
    (Emphasis, Code, Strikethrough, Diff, Link, Image) in document order.
    The text between two matches is always emitted, even when it is empty.

    If *source* ends with exactly one backslash, the backslash is removed and
    a HardBreak is appended as the last token.
    """
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    tokens = []
    pattern = re.compile(inlineRegex, re.MULTILINE | re.DOTALL | re.VERBOSE)

    lookup = 0  # index just past the previous match
    for match in pattern.finditer(source):
        tokens.append(source[lookup:match.start()])
        lookup = match.end()

        # Exactly one alternative of the pattern takes part in a match. A
        # group of a non-participating alternative is None, whereas a
        # participating group may legitimately be the empty string (for
        # example the alt text of "![](pic.png)"), so compare against None
        # instead of relying on truthiness.
        if match.group("em") is not None:
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("textCode") is not None:
            tokens.append(Code(match.group("textCode")))
        elif match.group("strike") is not None:
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff") is not None:
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlText") is not None:
            if match.group("imageFlag"):
                tokens.append(Image(match.group("urlText"), match.group("urlDestination")))
            else:
                tokens.append(Link(match.group("urlText"), match.group("urlDestination")))

    tokens.append(source[lookup:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens
293
294
295
def _is_rule(line):
    """Return True for a horizontal rule: three or more of "-", "_", "*", " " and nothing else."""
    stripped = line.strip()
    return len(stripped) >= 3 and only_chars(stripped, "-_* ")


def _is_bullet(line):
    """Return True when *line* opens an unordered-list item ("*", "+" or "-" plus a space)."""
    return line.startswith(("* ", "+ ", "- "))


def _ends_paragraph(line):
    """Return True when *line* is blank or starts with a marker that ends a paragraph."""
    return not line.strip() or line.startswith(
        ("#", ">", "* ", "+ ", "- ", "~~~", "```")
    )


def tokenise(source):
    """Split Markdown *source* into a list of block-level elements.

    One leading space is removed from every line before classification.
    The returned list contains Element instances (void blocks for blank
    lines), Rule, UnorderedList, Heading, Blockquote, CodeBlock and
    Paragraph objects in document order. The list always starts with the
    initial void Element.
    """
    tokens = []
    current_block = Element()

    # Remove a single leading space from every line.
    lines = [line[1:] if line.startswith(" ") else line for line in source.split("\n")]

    i = 0
    while i < len(lines):
        line = lines[i]

        # Candidate setext underline. It is empty when there is no next line
        # or the next line is blank, so that neither case is mistaken for an
        # underline (and the last line never indexes past the end).
        underline = lines[i + 1].strip() if i + 1 < len(lines) else ""

        # Every branch below starts a new block, so the previous one is
        # always flushed first; otherwise two adjacent blocks of the same
        # kind would overwrite each other.
        tokens.append(current_block)

        if not line.strip():
            # Void block
            current_block = Element()
            i += 1
        elif _is_rule(line):
            # Horizontal rule
            current_block = Rule()
            i += 1
        elif _is_bullet(line):
            items = []
            while i < len(lines) and _is_bullet(lines[i]):
                parts = [lines[i][2:].strip() + "\n"]
                i += 1
                # Indented lines continue the current item.
                while i < len(lines) and lines[i].startswith(" "):
                    continuation = lines[i]
                    # Drop up to two leading spaces, never a text character.
                    if continuation.startswith("  "):
                        continuation = continuation[2:]
                    else:
                        continuation = continuation[1:]
                    parts.append(continuation + "\n")
                    i += 1
                items.append(ListItem("".join(parts)))

            current_block = UnorderedList(items)
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # ATX heading: one or more "#" followed by at least one space.
            current_block = Heading(line.lstrip("#").strip(), leading(line, "#"))
            i += 1
        elif line.startswith(">"):
            quoted = []
            # A quote runs over ">" lines plus any following non-blank lines
            # that are neither "#" lines nor horizontal rules.
            while i < len(lines) and (
                lines[i].startswith(">")
                or (
                    lines[i].strip()
                    and not lines[i].startswith("#")
                    and not _is_rule(lines[i])
                )
            ):
                quoted.append(lines[i].lstrip(">") + "\n")
                i += 1

            current_block = Blockquote("".join(quoted))
        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            language = line.lstrip("`~").strip()

            body = []
            i += 1  # skip the opening fence
            while i < len(lines) and lines[i].strip() not in ("```", "~~~"):
                body.append(lines[i] + "\n")
                i += 1

            if i < len(lines):
                i += 1  # prevent a new block from beginning with the closing fence

            current_block = CodeBlock("".join(body), language=language)
        elif underline and (only_chars(underline, "=") or only_chars(underline, "-")):
            # Setext heading: text line followed by a row of "=" or "-".
            current_block = Heading(line.strip(), 1 if underline.startswith("=") else 2)
            i += 2
        else:
            # Paragraph. The current line is always consumed, so the loop
            # makes progress even for lines such as "#tag" that look like a
            # block marker but matched none of the branches above.
            parts = [line.strip() + "\n"]
            i += 1
            while i < len(lines) and not _ends_paragraph(lines[i]):
                parts.append(lines[i].strip() + "\n")
                i += 1

            current_block = Paragraph("".join(parts))

    tokens.append(current_block)

    return tokens
399
400
401
def make_html(ast):
    """Render a token list into a BeautifulSoup document.

    *ast* is an iterable of plain strings and element objects. Strings are
    appended as text. Elements without a "content" attribute, and elements
    whose tag_name is "m-void", are skipped. List content is rendered
    recursively; any other truthy content becomes the tag's text.

    Returns the BeautifulSoup object holding the generated nodes.
    """
    soup = beautifulsoup.BeautifulSoup()
    for node in ast:
        # Use bs4 to generate HTML
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        name = node.tag_name
        tag = soup.new_tag(str(name))
        if name == "a":
            tag["href"] = node.destination
        elif name == "img":
            tag["src"] = node.destination
            # <img> cannot have children, so its text is emitted as the
            # alt attribute rather than as tag content.
            tag["alt"] = node.content or ""
        elif name == "pre":
            tag["data-language"] = node.language
        if node.classes:
            tag["class"] = " ".join(node.classes)

        try:
            if isinstance(node.content, list):
                tag.append(make_html(node.content))
            elif node.content and name != "img":
                tag.string = node.content
        except AttributeError as exc:
            # Report the problem and still emit the (possibly empty) tag.
            print(exc, file=sys.stderr)

        soup.append(tag)
    return soup
428
429
430
if __name__ == '__main__':
    # Demo: build the token list for a small sample document ...
    sample = r"""
Hello World
===========

Lorem
ipsum
dolor
sit
amet.

1. Test
2. Test
3. Test

* Lorem
ipsum
* Test
* Test
"""
    ast = tokenise(sample)

    # ... then render it to HTML and pretty-print with a 4-space indent.
    document = make_html(ast)
    formatter = beautifulsoup.formatter.HTMLFormatter(indent=4)
    print(document.prettify(formatter=formatter))
458
459