By using this site, you agree to have cookies stored on your device, strictly for functional purposes, such as storing your session and preferences.

Dismiss

 markdown.py

View raw Download
text/x-script.python • 13.86 kiB
Python script, ASCII text executable
        
            
1
import re
2
import bs4 as beautifulsoup
3
import sys
4
5
6
def only_chars(string, chars):
    """Tell whether ``string`` is built exclusively from characters in ``chars``.

    Both arguments are treated as collections of characters. An empty
    ``string`` contains no foreign characters, so the result is ``True``.
    """
    return set(string) <= set(chars)
10
11
12
# Pattern describing every inline construct. parse_line compiles it with
# re.VERBOSE, so unescaped whitespace below is ignored and "#" starts a comment.
# Each alternative exposes its own named groups; groups belonging to the
# alternatives that did not match are None.
inline_regex = r"""
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\) # hyperlink or media
|
<(?P<urlDestination2>[^<>]*)> # autolink
|
(?P<em>\*{1,7}) (?P<textEm>(?:\\\*|[^*])*) (?P=em) # emphasis with * not requiring space on either side
|
(?<!\S)(?P<em2>_{1,7}) (?P<textEm2>(?:\\.|[^*])*) (?P=em2)(?=\s|$) # emphasis with _ requiring space on at least one side; the lookbehind keeps the preceding whitespace out of the match
|
`` (?P<textCode2>(?:(?!``).)+) `` # inline code (2 backticks); the content may contain single backticks
|
[`] (?P<textCode>(?:\\[`]|[^`])*) [`] # inline code
|
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2}) # strikethrough
|
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff) # diffs
"""
29
30
31
def leading(string, character):
    """Count the characters removed from the start of ``string`` by ``str.lstrip(character)``.

    ``character`` is passed to ``lstrip`` unchanged, so when it holds several
    characters any of them counts towards the total.
    """
    remainder = string.lstrip(character)
    return len(string) - len(remainder)
33
34
35
def trailing(string, character):
    """Count the characters removed from the end of ``string`` by ``str.rstrip(character)``.

    ``character`` is passed to ``rstrip`` unchanged, so when it holds several
    characters any of them counts towards the total.
    """
    remainder = string.rstrip(character)
    return len(string) - len(remainder)
37
38
39
class Element:
    """Base class of every node in the parsed document.

    Used directly, an Element is a "void" block: it carries no content and
    reports the tag name ``m-void``.
    """

    def __init__(self):
        # Names joined into the HTML ``class`` attribute when the node is rendered.
        self.classes = []
        # Payload of the node; subclasses replace this with text or child tokens.
        self.content = None

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        """Tag name used when rendering this node."""
        return "m-void"
51
52
53
class Container(Element):
    """Element whose content is the inline-token list parsed from a source string."""

    def __init__(self, content):
        super().__init__()
        inline_tokens = parse_line(content)
        self.content = inline_tokens

    def __repr__(self):
        return f"Generic container element: {self.content!r}"
60
61
62
class Rule(Element):
    """Horizontal rule; reports the tag name ``hr``."""

    # No constructor of its own: the inherited Element.__init__ sets up
    # everything this node needs.

    def __repr__(self):
        return "Rule"

    @property
    def tag_name(self):
        """Tag name used when rendering this node."""
        return "hr"
72
73
74
class HardBreak(Element):
    """Forced line break; reports the tag name ``br``."""

    # No constructor of its own: the inherited Element.__init__ sets up
    # everything this node needs.

    def __repr__(self):
        return "Hard break"

    @property
    def tag_name(self):
        """Tag name used when rendering this node."""
        return "br"
84
85
86
class Heading(Container):
    """Heading whose text is inline-parsed and whose ``level`` selects the tag (``h<level>``)."""

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering this node, e.g. ``h2`` for level 2."""
        return f"h{self.level}"
98
99
100
class Paragraph(Container):
    """Paragraph of inline-parsed text; reports the tag name ``p``."""

    def __init__(self, content):
        # Container.__init__ already stores parse_line(content) in
        # self.content, so the text is parsed exactly once here instead of
        # parsing an empty placeholder first and the real text afterwards.
        super().__init__(content)

    def __repr__(self):
        return f"Paragraph:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering this node."""
        return "p"
111
112
113
class CodeBlock(Element):
    """Fenced code block holding its raw text and a language label.

    ``content`` is stored as given (it is not inline-parsed); ``language``
    defaults to ``"text"``.
    """

    def __init__(self, content, language="text"):
        super().__init__()
        self.language = language
        self.content = content

    def __repr__(self):
        return f"Code block ({self.language}):\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering this node."""
        return "pre"
125
126
127
class UnorderedList(Element):
    """Bulleted list; ``content`` is stored exactly as passed in (the tokeniser passes ListItem objects)."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return f"Unordered list:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering this node."""
        return "ul"
138
139
140
class OrderedList(Element):
    """Numbered list; ``content`` is stored exactly as passed in (the tokeniser passes ListItem objects)."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return f"Ordered list:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering this node."""
        return "ol"
151
152
153
class ListItem(Element):
    """Single list entry whose source text is tokenised into nested block tokens."""

    def __init__(self, content):
        super().__init__()
        # The item body is block-level Markdown, so it goes back through
        # the block tokeniser rather than the inline parser.
        nested_blocks = tokenise(content)
        self.content = nested_blocks

    def __repr__(self):
        return f"List item:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering this node."""
        return "li"
164
165
166
class Blockquote(Paragraph):
    """Quoted section whose source text is tokenised into nested block tokens."""

    def __init__(self, content):
        # Initialise the parent with empty text; the real content is
        # block-level Markdown and is assigned from the tokeniser below.
        super().__init__("")
        nested_blocks = tokenise(content)
        self.content = nested_blocks

    def __repr__(self):
        return f"Blockquote:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering this node."""
        return "blockquote"
177
178
179
class Emphasis(Container):
    """Emphasised inline text.

    ``value`` is the number of marker characters that wrapped the text (the
    inline parser passes the length of the ``*`` or ``_`` run). It selects
    which ``emphasis-N`` class names are attached and whether the tag is
    ``em`` (value 1) or ``strong`` (anything else).
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

        # (condition, class name) pairs, checked in this order so the class
        # list keeps the order emphasis-3, emphasis-2, emphasis-1.
        candidates = (
            (value >= 4, "emphasis-3"),
            (value % 4 >= 2, "emphasis-2"),
            (value % 2 != 0, "emphasis-1"),
        )
        for applies, class_name in candidates:
            if applies:
                self.classes.append(class_name)

    def __repr__(self):
        return f"Emphasis ({self.value}): {self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering this node."""
        if self.value == 1:
            return "em"
        return "strong"
196
197
198
class Code(Element):
    """Inline code span; the raw text is stored as a one-item list and is not inline-parsed."""

    def __init__(self, content):
        super().__init__()
        self.content = [content]

    def __repr__(self):
        return "Inline code: {}".format(self.content)

    @property
    def tag_name(self):
        """Tag name used when rendering this node."""
        return "code"
209
210
211
class Strikethrough(Container):
    """Struck-through inline text; reports the tag name ``s``."""

    # The inherited Container.__init__(content) is used as-is: it parses
    # the text into inline tokens and stores them in self.content.

    def __repr__(self):
        return f"Strikethrough: {self.content!r}"

    @property
    def tag_name(self):
        """Tag name used when rendering this node."""
        return "s"
221
222
223
class Diff(Container):
    """Inline insertion or deletion.

    ``value`` is the marker that wrapped the text: ``"++"`` yields the tag
    ``ins``, any other value yields ``del``.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return "Diff ({}): {}".format(self.value, self.content)

    @property
    def tag_name(self):
        """Tag name used when rendering this node."""
        if self.value == "++":
            return "ins"
        return "del"
234
235
236
class Link(Element):
    """Hyperlink with inline-parsed link text and a destination.

    ``image`` only affects the label used by ``repr``; the tag name of a
    plain Link is always ``a``.
    """

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.destination = destination
        self.image = image
        self.content = parse_line(content)

    def __repr__(self):
        kind = "Image" if self.image else "Link"
        return f"{kind}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        """Tag name used when rendering this node."""
        return "a"
249
250
251
class Image(Link):
    """Link variant flagged as an image; reports the tag name ``img``."""

    def __init__(self, text, destination):
        super().__init__(text, destination, image=True)

    @property
    def tag_name(self):
        """Tag name used when rendering this node."""
        return "img"
258
259
260
def parse_line(source):
    """Split ``source`` into a list of inline tokens.

    Text between inline constructs is kept as ``str`` items (possibly empty);
    each construct recognised by ``inline_regex`` becomes an Emphasis, Code,
    Strikethrough, Diff, Link or Image object. When ``source`` ends with
    exactly one backslash, that backslash is removed and a HardBreak is
    appended as the last token.

    Returns:
        list: strings and Element objects in source order.
    """
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    tokens = []
    pattern = re.compile(inline_regex, re.MULTILINE | re.DOTALL | re.VERBOSE)

    lookup = 0  # index just past the previous match
    for match in pattern.finditer(source):
        # Literal text between the previous construct and this one.
        tokens.append(source[lookup:match.start()])
        lookup = match.end()

        # Exactly one alternative of the pattern matched; groups that belong
        # to the other alternatives are None.
        if match.group("em"):
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("em2"):
            tokens.append(Emphasis(match.group("textEm2"), len(match.group("em2"))))
        elif match.group("textCode"):
            tokens.append(Code(match.group("textCode")))
        elif match.group("textCode2"):
            tokens.append(Code(match.group("textCode2")))
        elif match.group("strike"):
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff"):
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlDestination") is not None:
            # Compare against None rather than testing truthiness so that
            # links and images with empty text, e.g. ![](pic.png), are kept.
            node_type = Image if match.group("imageFlag") else Link
            tokens.append(node_type(match.group("urlText"), match.group("urlDestination")))
        elif match.group("urlDestination2"):
            url_destination = match.group("urlDestination2")
            if "://" in url_destination:
                url_text = url_destination
            else:
                # remove tel, mailto, sms prefixes; without any ":" there is
                # no prefix, so the whole target is shown as the link text
                _, separator, remainder = url_destination.partition(":")
                url_text = remainder if separator else url_destination
                if url_destination.startswith("mailto:"):
                    url_destination = url_destination.replace("@", "&#64;")  # prevent email harvesting
                url_text = url_text.replace("@", "&#64;")  # prevent protocol injection
                # NOTE(review): if the HTML serializer escapes "&" again, the
                # entity will be shown literally - confirm the rendered output.

            tokens.append(Link(url_text, url_destination))
        else:
            # The match produced no token (empty code span, "<>"): keep the
            # matched characters as literal text instead of deleting them.
            tokens.append(match.group(0))

    tokens.append(source[lookup:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens
314
315
316
def _is_bullet(line):
    """Return True when ``line`` opens an unordered-list item: ``*``, ``+`` or ``-`` followed by a space."""
    return line[:2] in ("* ", "+ ", "- ")


def _is_rule(line):
    """Return True when ``line`` is a horizontal rule: at least three characters, all from ``-_* ``."""
    stripped = line.strip()
    return len(stripped) >= 3 and only_chars(stripped, "-_* ")


def _ends_paragraph(line):
    """Return True when ``line`` cannot continue a paragraph that is already open."""
    return (
        not line.strip()
        or line.startswith(("#", ">", ";", "* ", "+ ", "- ", "~~~", "```"))
        or re.match(r"^\d+\.", line) is not None
    )


def tokenise(source):
    """Split Markdown ``source`` into a list of block-level tokens.

    One leading space is removed from every line, then the lines are consumed
    top to bottom. Blank lines and lines starting with ``;`` produce void
    Element objects; the other lines produce Rule, UnorderedList, OrderedList,
    Heading, Blockquote, CodeBlock or Paragraph objects. The first token is
    always the void Element the scan starts with.

    Returns:
        list: Element objects in document order.
    """
    tokens = []

    current_block = Element()

    lines = [line[1:] if line.startswith(" ") else line for line in source.split("\n")]  # remove leading spaces

    i = 0
    while i < len(lines):
        line = lines[i]

        # Every branch below starts a new block, so the block built so far is
        # always emitted first. Emitting it unconditionally keeps a block
        # from being lost when the same kind of block follows it directly
        # (for example two fenced code blocks back to back).
        tokens.append(current_block)

        if not line.strip() or line.startswith(";"):
            # Void block
            current_block = Element()
            i += 1
        elif _is_rule(line):
            # Horizontal rule
            current_block = Rule()
            i += 1
        elif _is_bullet(line):
            content = []

            while i < len(lines) and _is_bullet(lines[i]):
                inner_content = lines[i][2:].strip() + "\n"  # discard marker and space
                i += 1
                # Following non-blank lines belong to the same item until the
                # next marker; _is_bullet is safe on one-character lines.
                while i < len(lines) and lines[i].strip() and not _is_bullet(lines[i]):
                    inner_content += lines[i] + "\n"
                    i += 1

                content.append(ListItem(inner_content))

            current_block = UnorderedList(content)
        elif re.match(r"^\d+\.", line):
            content = []

            while i < len(lines) and re.match(r"^ ?\d+\.", lines[i]):
                marker, _, rest = lines[i].partition(".")  # discard number and period
                # Width of "<number>." on the item's own line; continuation
                # lines are dedented by at most this many spaces.
                marker_length = len(marker) + 1
                inner_content = rest + "\n"
                i += 1
                while i < len(lines) and lines[i].strip() and not re.match(r"^ ?\d+\.", lines[i]):
                    indent = min(marker_length, leading(lines[i], " "))
                    inner_content += lines[i][indent:] + "\n"
                    i += 1

                content.append(ListItem(inner_content))

            current_block = OrderedList(content)
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # ATX heading: one or more "#" followed by at least one space.
            content = line.lstrip("#").strip()
            current_block = Heading(content, leading(line, "#"))

            i += 1
        elif line.startswith(">"):
            content = ""

            # Quoted lines, plus unquoted non-blank lines that directly follow
            # them, unless such a line is a "#" line or a horizontal rule.
            while i < len(lines) and (
                lines[i].startswith(">")
                or (
                    lines[i].strip()
                    and not lines[i].startswith("#")
                    and not _is_rule(lines[i])
                )
            ):
                content += lines[i].lstrip(">") + "\n"
                i += 1

            current_block = Blockquote(content)
        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            language = line.lstrip("`~").strip()

            content = ""
            i += 1  # skip the opening fence
            while i < len(lines) and lines[i].strip() not in ("```", "~~~"):
                content += lines[i] + "\n"
                i += 1

            if i < len(lines):
                i += 1  # prevent a new block from beginning with the closing fence

            current_block = CodeBlock(content, language=language)
        elif (
            i < len(lines) - 1
            and lines[i + 1].strip()
            and (only_chars(lines[i + 1].strip(), "=") or only_chars(lines[i + 1].strip(), "-"))
        ):
            # Setext heading: a text line underlined with "=" (level 1) or "-" (level 2).
            content = line.strip()
            current_block = Heading(content, 1 if lines[i + 1].startswith("=") else 2)

            i += 2
        else:
            # Create a paragraph, if there is no other specifier.
            # The current line is always consumed, even when it looks like the
            # start of another construct (e.g. "#tag" or "~~~~") that no
            # branch above accepted; otherwise the scan would never advance.
            content = lines[i].strip() + "\n"
            i += 1

            while i < len(lines) and not _ends_paragraph(lines[i]):
                content += lines[i].strip() + "\n"
                i += 1

            current_block = Paragraph(content)

    tokens.append(current_block)

    return tokens
442
443
444
def make_html(ast):
    """Render a list of tokens into a BeautifulSoup document.

    Strings are appended as text. Objects that have a ``content`` attribute
    and a tag name other than ``m-void`` become tags named after their
    ``tag_name``; list content is rendered recursively and string content is
    inserted as text. Anything else is skipped.

    Returns:
        The BeautifulSoup object holding the rendered nodes.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = beautifulsoup.BeautifulSoup("", "html.parser")
    for node in ast:
        # Use bs4 to generate HTML
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        tag = soup.new_tag(str(node.tag_name))
        is_image = node.tag_name == "img"
        if node.tag_name == "a":
            tag["href"] = node.destination
        if is_image:
            tag["src"] = node.destination
            # The alt text is the plain text of the rendered content; joining
            # the raw token list fails when it contains inline elements.
            tag["alt"] = make_html(node.content or []).get_text()
        if node.tag_name == "pre":
            tag["data-language"] = node.language
        if node.classes:
            tag["class"] = " ".join(node.classes)

        try:
            if is_image:
                # Images carry everything in their attributes; no children.
                pass
            elif isinstance(node.content, list):
                tag.append(make_html(node.content))
            elif node.content:
                tag.string = node.content
        except AttributeError as exc:
            # Best effort: report the problem and still emit the tag.
            print(exc, file=sys.stderr)
        soup.append(tag)
    return soup
474
475
476
def markdown2html(markdown):
    """Convert Markdown text to HTML: tokenise it, then render the tokens with make_html.

    Returns whatever make_html returns for the token list.
    """
    ast = tokenise(markdown)
    return make_html(ast)
478
479