By using this site, you agree to have cookies stored on your device, strictly for functional purposes, such as storing your session and preferences.

Dismiss

 markdown.py

View raw Download
text/x-script.python • 13.78 kiB
Python script, ASCII text executable
        
            
1
import re
2
import bs4 as beautifulsoup
3
import sys
4
5
6
def only_chars(string, chars):
    """Return True when every character of *string* also occurs in *chars*.

    An empty *string* trivially satisfies the condition.
    """
    return set(string) <= set(chars)
10
11
12
# Pattern for all inline constructs.  It is written for re.VERBOSE, so
# whitespace is insignificant and "#" starts a comment inside the pattern.
#
# Group names must be unique across the whole pattern, so single- and
# double-backtick code spans share one alternative: the opening fence is
# captured in "codeFence" and must be repeated to close the span, while the
# text is always available as "textCode".
inline_regex = r"""
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\) # hyperlink or media
|
<(?P<urlDestination2>[^<>]*)> # autolink
|
(?P<em>\*{1,7}) (?P<textEm>(?:\\\*|[^*])*) (?P=em) # emphasis with * not requiring space on either side
|
(?:^|\s)(?P<em2>_{1,7}) (?P<textEm2>(?:\\.|[^_])*) (?P=em2)(?=\s|$) # emphasis with _ requiring whitespace or a line boundary on both sides
|
(?P<codeFence>`{1,2}) (?P<textCode>(?:\\`|[^`])*) (?P=codeFence) # inline code (1 or 2 backticks)
|
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2}) # strikethrough
|
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff) # diffs
"""
29
30
31
def leading(string, character):
    """Count how many characters ``str.lstrip(character)`` removes from the start of *string*."""
    remainder = string.lstrip(character)
    return len(string) - len(remainder)
33
34
35
def trailing(string, character):
    """Count how many characters ``str.rstrip(character)`` removes from the end of *string*."""
    remainder = string.rstrip(character)
    return len(string) - len(remainder)
37
38
39
class Element:
    """Base class for every node; a bare instance stands for a void block."""

    def __init__(self):
        # Extra CSS class names for the node (empty by default)
        self.classes = []
        # Subclasses replace this with their parsed or raw content
        self.content = None

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        """Tag name for this node; ``m-void`` marks a node with no markup of its own."""
        return "m-void"
51
52
53
class Container(Element):
    """Element whose content is the inline-parsed form of a source string."""

    def __init__(self, content):
        super().__init__()
        # Store the token list produced by parse_line, not the raw text
        self.content = parse_line(content)

    def __repr__(self):
        return f"Generic container element: {self.content!r}"
60
61
62
class Rule(Element):
    """Horizontal rule; carries no content of its own."""

    # No constructor of its own: Element.__init__ already does all the set-up.

    def __repr__(self):
        return "Rule"

    @property
    def tag_name(self):
        """Tag name for a horizontal rule."""
        return "hr"
72
73
74
class HardBreak(Element):
    """Forced line break; carries no content of its own."""

    # No constructor of its own: Element.__init__ already does all the set-up.

    def __repr__(self):
        return "Hard break"

    @property
    def tag_name(self):
        """Tag name for a forced line break."""
        return "br"
84
85
86
class Heading(Container):
    """Heading whose text is inline-parsed and whose level selects the tag."""

    def __init__(self, content, level):
        super().__init__(content)
        # Numeric level; it is appended to "h" to form the tag name
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name built from the level, e.g. ``h2`` for level 2."""
        return f"h{self.level}"
98
99
100
class Paragraph(Container):
    """Paragraph of inline-parsed text."""

    def __init__(self, content):
        # Container.__init__ already stores parse_line(content); delegating
        # directly avoids parsing an empty string first and then throwing
        # that result away.
        super().__init__(content)

    def __repr__(self):
        return "Paragraph:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        """Tag name for a paragraph."""
        return "p"
111
112
113
class CodeBlock(Element):
    """Fenced code block; the content is kept as an unparsed string."""

    def __init__(self, content, language="text"):
        super().__init__()
        self.language = language
        self.content = content

    def __repr__(self):
        return f"Code block ({self.language}):\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name for a code block."""
        return "pre"
125
126
127
class UnorderedList(Element):
    """Bulleted list; *content* is stored exactly as it is passed in."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return f"Unordered list:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name for a bulleted list."""
        return "ul"
138
139
140
class OrderedList(Element):
    """Numbered list; *content* is stored exactly as it is passed in."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return f"Ordered list:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name for a numbered list."""
        return "ol"
151
152
153
class ListItem(Element):
    """Single list entry; its source text is tokenised into block tokens."""

    def __init__(self, content):
        super().__init__()
        # Items hold block-level tokens, so the text goes through tokenise
        self.content = tokenise(content)

    def __repr__(self):
        return f"List item:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name for a list entry."""
        return "li"
164
165
166
class Blockquote(Paragraph):
    """Quoted section; its source text is tokenised into block tokens."""

    def __init__(self, content):
        # Initialise the parent with empty text, then replace the content
        # with the block-level tokens of the quoted source.
        super().__init__("")
        self.content = tokenise(content)

    def __repr__(self):
        return f"Blockquote:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name for a quoted section."""
        return "blockquote"
177
178
179
class Emphasis(Container):
    """Emphasised inline text; *value* is the length of the marker run."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value
        # Each condition contributes one class name, strongest first
        candidates = (
            (value >= 4, "emphasis-3"),
            (value % 4 >= 2, "emphasis-2"),
            (value % 2, "emphasis-1"),
        )
        for applies, class_name in candidates:
            if applies:
                self.classes.append(class_name)

    def __repr__(self):
        return f"Emphasis ({self.value}): {self.content!r}"

    @property
    def tag_name(self):
        """``em`` for a single marker, ``strong`` for anything else."""
        if self.value == 1:
            return "em"
        return "strong"
196
197
198
class Code(Element):
    """Inline code span; the text is kept verbatim inside a one-item list."""

    def __init__(self, content):
        super().__init__()
        # Wrapped in a list without inline parsing
        self.content = [content]

    def __repr__(self):
        return "Inline code: " + str(self.content)

    @property
    def tag_name(self):
        """Tag name for inline code."""
        return "code"
209
210
211
class Strikethrough(Container):
    """Struck-through inline text."""

    # No constructor of its own: Container.__init__ already parses the content.

    def __repr__(self):
        return "Strikethrough: " + repr(self.content)

    @property
    def tag_name(self):
        """Tag name for struck-through text."""
        return "s"
221
222
223
class Diff(Container):
    """Inserted or deleted inline text; *value* is the marker that was matched."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return "Diff ({}): {}".format(self.value, self.content)

    @property
    def tag_name(self):
        """``ins`` for the ``++`` marker, ``del`` for any other marker."""
        if self.value == "++":
            return "ins"
        return "del"
234
235
236
class Link(Element):
    """Hyperlink with inline-parsed text and a destination."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.image = image
        self.destination = destination
        self.content = parse_line(content)

    def __repr__(self):
        kind = "Image" if self.image else "Link"
        return f"{kind}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        """Tag name for a hyperlink."""
        return "a"
249
250
251
class Image(Link):
    """Link variant that is always flagged as an image."""

    def __init__(self, text, destination):
        super().__init__(text, destination, image=True)

    @property
    def tag_name(self):
        """Tag name for an image."""
        return "img"
258
259
260
def parse_line(source):
    """Split *source* into a list of inline tokens.

    Plain text between matches is kept as ``str`` entries (possibly empty).
    Each match of ``inline_regex`` becomes an ``Emphasis``, ``Code``,
    ``Strikethrough``, ``Diff``, ``Image`` or ``Link`` element.  A bracketed
    link or a code span whose text is empty produces no element.

    When *source* ends in exactly one backslash, the backslash is removed
    and a ``HardBreak`` is appended as the final token.

    Returns:
        A list mixing ``str`` and element objects, in source order.
    """
    # Exactly one trailing backslash requests a hard line break
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    tokens = []
    pattern = re.compile(inline_regex, re.MULTILINE | re.DOTALL | re.VERBOSE)

    lookup = 0  # index just past the previous match
    for match in pattern.finditer(source):
        start = match.start()
        if match.group("em2"):
            # The underscore alternative also consumes the whitespace in
            # front of the marker; start at the marker so that whitespace
            # stays in the preceding text token instead of being dropped.
            start = match.start("em2")

        tokens.append(source[lookup:start])
        lookup = match.end()

        # Only one alternative of the pattern can match at a time
        if match.group("em"):
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("em2"):
            tokens.append(Emphasis(match.group("textEm2"), len(match.group("em2"))))
        elif match.group("textCode"):
            tokens.append(Code(match.group("textCode")))
        elif match.group("strike"):
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff"):
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlText"):
            if match.group("imageFlag"):
                tokens.append(Image(match.group("urlText"), match.group("urlDestination")))
            else:
                tokens.append(Link(match.group("urlText"), match.group("urlDestination")))
        elif match.group("urlDestination2"):
            url_destination = match.group("urlDestination2")
            if "://" not in url_destination:
                # Remove tel, mailto, sms prefixes from the visible text.
                # Without any ":" there is no prefix to remove, so the whole
                # target is shown instead of an empty link text.
                _, colon, remainder = url_destination.partition(":")
                url_text = remainder if colon else url_destination
                if url_destination.startswith("mailto:"):
                    url_destination = url_destination.replace("@", "&#64;")  # prevent email harvesting
                    url_text = url_text.replace("@", "&#64;")  # prevent protocol injection
            else:
                url_text = url_destination

            tokens.append(Link(url_text, url_destination))

    tokens.append(source[lookup:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens
312
313
314
_BULLET_MARKERS = ("* ", "+ ", "- ")
_PARAGRAPH_STOPPERS = ("#", ">", ";", "* ", "+ ", "- ", "~~~", "```")


def _is_rule(line):
    """Return True when *line* has at least three characters, all from ``-_* ``."""
    stripped = line.strip()
    return only_chars(stripped, "-_* ") and len(stripped) >= 3


def _interrupts_paragraph(line):
    """Return True when *line* starts with a marker that ends a running paragraph."""
    return line.startswith(_PARAGRAPH_STOPPERS) or bool(re.match(r"^\d+\.", line))


def tokenise(source):
    """Split *source* into a list of block-level tokens.

    One leading space is removed from every line, then the lines are
    consumed top to bottom.  Each line is classified, in this order, as:
    blank or ``;`` line (void block), horizontal rule, bulleted list,
    numbered list, ``#`` heading, blockquote, fenced code block, heading
    underlined with ``=`` or ``-``, or paragraph.

    The finished block is appended to the result whenever a new one starts,
    so the list begins with the initial void ``Element``.

    Returns:
        A list of ``Element`` instances in source order.
    """
    tokens = []
    current_block = Element()

    # Remove a single leading space from every line
    lines = [line[1:] if line.startswith(" ") else line for line in source.split("\n")]

    i = 0
    while i < len(lines):
        line = lines[i]
        # Stripped following line, or "" when this is the last line
        underline = lines[i + 1].strip() if i + 1 < len(lines) else ""

        # Every branch starts a new block, so the finished one is stored
        # first.  Storing it unconditionally keeps adjacent blocks of the
        # same kind (e.g. two code fences in a row) from overwriting each
        # other.
        tokens.append(current_block)

        if not line.strip() or line.startswith(";"):
            # Void block
            current_block = Element()
            i += 1
        elif _is_rule(line):
            # Horizontal rule
            current_block = Rule()
            i += 1
        elif line.startswith(_BULLET_MARKERS):
            items = []
            while i < len(lines) and lines[i].startswith(_BULLET_MARKERS):
                inner_content = lines[i][2:].strip() + "\n"  # discard marker and space
                i += 1
                # Following non-blank lines without a marker belong to the item
                while i < len(lines) and lines[i].strip() and not lines[i].startswith(_BULLET_MARKERS):
                    inner_content += lines[i] + "\n"
                    i += 1
                items.append(ListItem(inner_content))
            current_block = UnorderedList(items)
        elif re.match(r"^\d+\.", line):
            items = []
            while i < len(lines) and re.match(r"^ ?\d+\.", lines[i]):
                marker, _, rest = lines[i].partition(".")
                inner_content = rest + "\n"  # discard number and period
                # Width of "<number>." on the item's own line; measured
                # before moving on so the last line of the input is safe.
                marker_length = len(marker) + 1
                i += 1
                while i < len(lines) and lines[i].strip() and not re.match(r"^ ?\d+\.", lines[i]):
                    # Remove at most marker_length characters of indentation,
                    # never any of the text itself
                    indent = min(leading(lines[i], " "), marker_length)
                    inner_content += lines[i][indent:] + "\n"
                    i += 1
                items.append(ListItem(inner_content))
            current_block = OrderedList(items)
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # Heading: the "#" run must be followed by at least one space
            current_block = Heading(line.lstrip("#").strip(), leading(line, "#"))
            i += 1
        elif line.startswith(">"):
            quoted = []
            # Lines starting with ">" always continue the quote; other
            # non-blank lines do too unless they start with "#" or form a rule.
            while i < len(lines) and (
                lines[i].startswith(">")
                or (lines[i].strip() and not lines[i].startswith("#") and not _is_rule(lines[i]))
            ):
                quoted.append(lines[i].lstrip(">") + "\n")
                i += 1
            current_block = Blockquote("".join(quoted))
        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            language = line.lstrip("`~").strip()

            code_lines = []
            i += 1  # skip the opening fence
            while i < len(lines) and lines[i].strip() not in ("```", "~~~"):
                code_lines.append(lines[i] + "\n")
                i += 1

            if i < len(lines):
                i += 1  # prevent a new block from beginning with the closing fence

            current_block = CodeBlock("".join(code_lines), language=language)
        elif underline and (only_chars(underline, "=") or only_chars(underline, "-")):
            # Heading marked by an underline of "=" (level 1) or "-" (level 2)
            current_block = Heading(line.strip(), 1 if lines[i + 1].startswith("=") else 2)
            i += 2
        else:
            # Create a paragraph, if there is no other specifier.  The first
            # line is always consumed so the loop is guaranteed to advance,
            # even for lines such as "#tag" that look like a marker but
            # matched no earlier branch.
            content = line.strip() + "\n"
            i += 1
            while i < len(lines) and lines[i].strip() and not _interrupts_paragraph(lines[i]):
                content += lines[i].strip() + "\n"
                i += 1
            current_block = Paragraph(content)

    tokens.append(current_block)

    return tokens
440
441
442
def _plain_text(tokens):
    """Concatenate the string parts of *tokens*, descending into nested list content."""
    pieces = []
    for token in tokens:
        if isinstance(token, str):
            pieces.append(token)
        elif isinstance(getattr(token, "content", None), list):
            pieces.append(_plain_text(token.content))
    return "".join(pieces)


def make_html(ast):
    """Build markup for a list of tokens using bs4.

    Strings are appended as text.  Every element whose ``tag_name`` is not
    ``m-void`` becomes a tag; list content is rendered recursively and
    string content is set as the tag's text.

    Args:
        ast: iterable of ``str`` and element objects.

    Returns:
        The ``BeautifulSoup`` object holding the generated nodes.
    """
    soup = beautifulsoup.BeautifulSoup()
    for node in ast:
        # Use bs4 to generate HTML
        if isinstance(node, str):
            soup.append(node)
        elif hasattr(node, "content") and node.tag_name != "m-void":
            tag = soup.new_tag(str(node.tag_name))
            if node.tag_name == "a":
                tag["href"] = node.destination
            if node.tag_name == "img":
                tag["src"] = node.destination
                # The content may hold nested elements (e.g. emphasis), so
                # it is flattened to plain text; joining the raw list would
                # raise TypeError on the first non-string token.
                tag["alt"] = _plain_text(node.content)
            if node.tag_name == "pre":
                tag["data-language"] = node.language
            if node.classes:
                tag["class"] = " ".join(node.classes)
            try:
                if isinstance(node.content, list):
                    tag.append(make_html(node.content))
                elif node.content and node.tag_name != "img":
                    tag.string = node.content

                if node.tag_name == "img":
                    # The text of an image lives in its alt attribute only
                    tag.string = ""
            except AttributeError as exc:
                # Report the problem and still emit the (possibly empty) tag
                print(exc, file=sys.stderr)
            soup.append(tag)
    return soup
472
473
474
def markdown2html(markdown):
    """Tokenise *markdown* and return the result of rendering the tokens with make_html."""
    block_tokens = tokenise(markdown)
    return make_html(block_tokens)
476
477