Web platform for sharing free data for ML and research

By using this site, you agree to have cookies stored on your device, strictly for functional purposes, such as storing your session and preferences.

Dismiss

 markdown.py

View raw Download
text/x-script.python • 13.86 kiB
Python script, ASCII text executable
        
            
1
import re
2
import bs4 as beautifulsoup
3
import sys
4
5
6
def only_chars(string, chars):
    """Return True when every character of ``string`` also occurs in ``chars``.

    Both arguments are treated as plain collections of characters, so order
    and repetition are irrelevant. An empty ``string`` always passes.
    """
    return set(string) <= set(chars)
10
11
12
# Grammar for inline (span-level) markup. It is meant to be compiled with
# re.VERBOSE, so whitespace outside character classes is ignored and "#"
# starts a comment inside the pattern itself. Each alternative exposes its
# pieces through named groups.
#
# Two details are deliberate:
#   * The "_" emphasis uses the lookbehind (?<!\S) rather than consuming the
#     preceding whitespace, so the match starts at the underscore and the
#     surrounding text keeps its space.
#   * The double-backtick code span is delimited by a literal "``" (a
#     character class such as [``] only ever matches ONE backtick) and its
#     body may contain single backticks. It is listed before the
#     single-backtick form so that it gets the first chance to match.
inline_regex = r"""
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\) # hyperlink or media
|
<(?P<urlDestination2>[^<>]*)> # autolink
|
(?P<em>\*{1,7}) (?P<textEm>(?:\\\*|[^*])*) (?P=em) # emphasis with * not requiring space on either side
|
(?<!\S)(?P<em2>_{1,7}) (?P<textEm2>(?:\\.|[^*])*) (?P=em2)(?=\s|$) # emphasis with _ requiring space on at least one side
|
`` (?P<textCode2>(?:\\`|(?!``).)*) `` # inline code (2 backticks)
|
[`] (?P<textCode>(?:\\[`]|[^`])*) [`] # inline code
|
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2}) # strikethrough
|
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff) # diffs
"""
29
30
31
def leading(string, character):
    """Count the characters stripped from the start of ``string``.

    ``character`` is handed to ``str.lstrip``, so when it holds several
    characters any of them is stripped.
    """
    remainder = string.lstrip(character)
    return len(string) - len(remainder)
33
34
35
def trailing(string, character):
    """Count the characters stripped from the end of ``string``.

    ``character`` is handed to ``str.rstrip``, so when it holds several
    characters any of them is stripped.
    """
    remainder = string.rstrip(character)
    return len(string) - len(remainder)
37
38
39
class Element:
    """Base node of the parsed document.

    A bare ``Element`` stands for a void block: it carries no content and
    reports the placeholder tag name ``"m-void"``.
    """

    def __init__(self):
        self.content = None
        # CSS class names attached to the element.
        self.classes = []

    @property
    def tag_name(self):
        """Tag name associated with this element."""
        return "m-void"

    def __repr__(self):
        return "Void block"
51
52
53
class Container(Element):
    """Element whose content is inline markup parsed by ``parse_line``.

    The class does not override ``tag_name``, so it reports the base
    ``"m-void"`` unless a subclass supplies its own.
    """

    def __init__(self, content):
        super().__init__()
        parsed = parse_line(content)
        self.content = parsed

    def __repr__(self):
        return f"Generic container element: {self.content!r}"
60
61
62
class Rule(Element):
    """Horizontal rule; has no content of its own."""

    # Construction is inherited unchanged from Element.

    @property
    def tag_name(self):
        """Tag name associated with this element."""
        return "hr"

    def __repr__(self):
        return "Rule"
72
73
74
class HardBreak(Element):
    """Forced line break; has no content of its own."""

    # Construction is inherited unchanged from Element.

    @property
    def tag_name(self):
        """Tag name associated with this element."""
        return "br"

    def __repr__(self):
        return "Hard break"
84
85
86
class Heading(Container):
    """Heading whose text is parsed as inline markup.

    ``level`` is stored as given and appended to ``"h"`` to form the tag
    name.
    """

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    @property
    def tag_name(self):
        """Tag name associated with this element."""
        return f"h{self.level}"

    def __repr__(self):
        return f"Heading level {self.level}:\n\t{self.content!r}"
98
99
100
class Paragraph(Container):
    """Paragraph of text whose content is parsed as inline markup."""

    def __init__(self, content):
        # Container.__init__ already stores parse_line(content); handing the
        # real content over directly avoids first parsing an empty string and
        # then parsing the content a second time.
        super().__init__(content)

    def __repr__(self):
        return "Paragraph:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        """Tag name associated with this element."""
        return "p"
111
112
113
class CodeBlock(Element):
    """Fenced block of code.

    ``content`` is kept verbatim (it is not parsed for inline markup) and
    ``language`` records the label given for the block, ``"text"`` by default.
    """

    def __init__(self, content, language="text"):
        super().__init__()
        self.language = language
        self.content = content

    @property
    def tag_name(self):
        """Tag name associated with this element."""
        return "pre"

    def __repr__(self):
        return f"Code block ({self.language}):\n\t{self.content!r}"
125
126
127
class UnorderedList(Element):
    """Bulleted list; ``content`` is stored exactly as supplied."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    @property
    def tag_name(self):
        """Tag name associated with this element."""
        return "ul"

    def __repr__(self):
        return f"Unordered list:\n\t{self.content!r}"
138
139
140
class OrderedList(Element):
    """Numbered list; ``content`` is stored exactly as supplied."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    @property
    def tag_name(self):
        """Tag name associated with this element."""
        return "ol"

    def __repr__(self):
        return f"Ordered list:\n\t{self.content!r}"
151
152
153
class ListItem(Element):
    """Single list entry.

    The item's source text is run through ``tokenise``, so an item holds
    block-level elements rather than a bare string.
    """

    def __init__(self, content):
        super().__init__()
        blocks = tokenise(content)
        self.content = blocks

    @property
    def tag_name(self):
        """Tag name associated with this element."""
        return "li"

    def __repr__(self):
        return f"List item:\n\t{self.content!r}"
164
165
166
class Blockquote(Paragraph):
    """Quoted section whose source text is tokenised into blocks.

    The class derives from ``Paragraph``, so ``isinstance(x, Paragraph)`` is
    also true for a blockquote.
    """

    def __init__(self, content):
        # The parent is initialised with empty text; the real content is the
        # block-level token list assigned right after.
        super().__init__("")
        blocks = tokenise(content)
        self.content = blocks

    @property
    def tag_name(self):
        """Tag name associated with this element."""
        return "blockquote"

    def __repr__(self):
        return f"Blockquote:\n\t{self.content!r}"
177
178
179
class Emphasis(Container):
    """Emphasised inline text.

    ``value`` is the emphasis strength passed by the caller. It selects
    which of the ``emphasis-3`` / ``emphasis-2`` / ``emphasis-1`` CSS classes
    are attached, and whether the tag is ``em`` (value 1) or ``strong``.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

        # (condition, class name) pairs, kept in the order the classes are added.
        class_rules = (
            (value >= 4, "emphasis-3"),
            (value % 4 >= 2, "emphasis-2"),
            (value % 2, "emphasis-1"),
        )
        for applies, class_name in class_rules:
            if applies:
                self.classes.append(class_name)

    @property
    def tag_name(self):
        """Tag name associated with this element."""
        if self.value == 1:
            return "em"
        return "strong"

    def __repr__(self):
        return f"Emphasis ({self.value}): {self.content!r}"
196
197
198
class Code(Element):
    """Inline code span.

    The text is wrapped in a one-element list and is not parsed for further
    inline markup.
    """

    def __init__(self, content):
        super().__init__()
        self.content = [content]

    @property
    def tag_name(self):
        """Tag name associated with this element."""
        return "code"

    def __repr__(self):
        return f"Inline code: {self.content}"
209
210
211
class Strikethrough(Container):
    """Struck-through inline text; content is parsed as inline markup."""

    # Construction is inherited unchanged from Container.

    @property
    def tag_name(self):
        """Tag name associated with this element."""
        return "s"

    def __repr__(self):
        return f"Strikethrough: {repr(self.content)}"
221
222
223
class Diff(Container):
    """Inline insertion or deletion.

    ``value`` is the marker that delimited the span; ``"++"`` produces an
    ``ins`` tag and anything else a ``del`` tag.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    @property
    def tag_name(self):
        """Tag name associated with this element."""
        if self.value == "++":
            return "ins"
        return "del"

    def __repr__(self):
        return f"Diff ({self.value}): {self.content}"
234
235
236
class Link(Element):
    """Hyperlink.

    ``content`` is the link text, parsed as inline markup; ``destination``
    is stored untouched. ``image`` only affects the textual representation.
    """

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.image = image
        self.destination = destination
        self.content = parse_line(content)

    @property
    def tag_name(self):
        """Tag name associated with this element."""
        return "a"

    def __repr__(self):
        kind = "Image" if self.image else "Link"
        return f"{kind}: {self.content} -> {self.destination}"
249
250
251
class Image(Link):
    """Image reference: a ``Link`` with the image flag set and an ``img`` tag."""

    def __init__(self, text, destination):
        super().__init__(text, destination, image=True)

    @property
    def tag_name(self):
        """Tag name associated with this element."""
        return "img"
258
259
260
def parse_line(source):
    """Split a run of text into plain strings and inline elements.

    The text is scanned with ``inline_regex``. The text between matches is
    kept as plain strings (possibly empty) and every match becomes the
    corresponding inline element.

    Args:
        source: Text containing inline markup.

    Returns:
        A list that interleaves ``str`` fragments with ``Emphasis``,
        ``Code``, ``Strikethrough``, ``Diff``, ``Link`` and ``Image``
        objects. A ``HardBreak`` is appended when ``source`` ends with
        exactly one backslash.
    """
    # Exactly one trailing backslash requests a hard break after the text.
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    tokens = []
    pattern = re.compile(inline_regex, re.MULTILINE | re.DOTALL | re.VERBOSE)

    cursor = 0
    for match in pattern.finditer(source):
        # Plain text between the previous match and this one.
        tokens.append(source[cursor:match.start()])
        cursor = match.end()

        # Only one alternative of the pattern can have matched.
        if match.group("em"):
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("em2"):
            tokens.append(Emphasis(match.group("textEm2"), len(match.group("em2"))))
        elif match.group("textCode"):
            tokens.append(Code(match.group("textCode")))
        elif match.group("textCode2"):
            tokens.append(Code(match.group("textCode2")))
        elif match.group("strike"):
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff"):
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlDestination") is not None:
            # Test that the group took part in the match rather than whether
            # the link text is non-empty: "![](pic.png)" and "[](url)" are
            # valid and must not be dropped.
            link_type = Image if match.group("imageFlag") else Link
            tokens.append(link_type(match.group("urlText"), match.group("urlDestination")))
        elif match.group("urlDestination2") is not None:
            target = match.group("urlDestination2")
            if ":" not in target:
                # No scheme separator at all (for instance "<br>"): this is
                # not a link, so keep the text exactly as written.
                tokens.append(match.group(0))
            else:
                if "://" not in target:
                    url_text = target.partition(":")[2]  # remove tel, mailto, sms prefixes
                    url_destination = target
                    if url_destination.startswith("mailto:"):
                        # prevent email harvesting
                        # NOTE(review): the destination is later assigned as an
                        # attribute through bs4, which presumably escapes the
                        # "&" again - verify the rendered href still works.
                        url_destination = url_destination.replace("@", "&#64;")
                    url_text = url_text.replace("@", "&#64;")  # prevent protocol injection
                else:
                    url_text = url_destination = target

                tokens.append(Link(url_text, url_destination))

    # Whatever follows the last match (the whole text when nothing matched).
    tokens.append(source[cursor:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens
314
315
316
def _is_rule(line):
    """Return True for a horizontal rule: 3+ characters drawn from ``-_* ``."""
    stripped = line.strip()
    return len(stripped) >= 3 and only_chars(stripped, "-_* ")


def _is_bullet(line):
    """Return True when the line opens with a ``*``, ``+`` or ``-`` list marker and a space."""
    return line.startswith(("* ", "+ ", "- "))


def _strip_indent(line, width):
    """Drop at most ``width`` leading spaces from ``line``, never other characters."""
    return line[min(leading(line, " "), width):]


def tokenise(source):
    """Split Markdown source into a list of block-level elements.

    The text is processed line by line. Each line either opens a new block
    (void, rule, list, heading, blockquote, code block, paragraph) or is
    consumed by the block that is currently being collected.

    Args:
        source: Markdown text; lines are separated by ``"\\n"``.

    Returns:
        A list of ``Element`` objects in document order. The list always
        starts with a void ``Element``, and every blank line (or line
        starting with ``;``) contributes one more.
    """
    tokens = []
    current_block = Element()

    # One leading space per line is tolerated and removed.
    lines = [line[1:] if line.startswith(" ") else line for line in source.split("\n")]
    ordered_item = re.compile(r"^ ?\d+\.")

    i = 0
    while i < len(lines):
        line = lines[i]

        # Every branch below opens a new block, so the previous one is
        # complete. It is always emitted; skipping it for "same kind" blocks
        # would silently lose content.
        tokens.append(current_block)

        if not line.strip() or line.startswith(";"):
            # Void block
            current_block = Element()
            i += 1
        elif _is_rule(line):
            # Horizontal rule
            current_block = Rule()
            i += 1
        elif _is_bullet(line):
            items = []
            while i < len(lines) and _is_bullet(lines[i]):
                item_source = lines[i][2:].strip() + "\n"  # discard marker and space
                i += 1
                # Non-blank lines up to the next marker belong to this item.
                while i < len(lines) and lines[i].strip() and not _is_bullet(lines[i]):
                    item_source += lines[i] + "\n"
                    i += 1
                items.append(ListItem(item_source))

            current_block = UnorderedList(items)
        elif re.match(r"^\d+\.", line):
            items = []
            while i < len(lines) and ordered_item.match(lines[i]):
                marker, _, rest = lines[i].partition(".")
                # Width of "<number>." on the marker line itself; continuation
                # lines are de-indented by at most this much.
                marker_length = len(marker) + 1
                item_source = rest + "\n"  # discard number and period
                i += 1
                while i < len(lines) and lines[i].strip() and not ordered_item.match(lines[i]):
                    item_source += _strip_indent(lines[i], marker_length) + "\n"
                    i += 1
                items.append(ListItem(item_source))

            current_block = OrderedList(items)
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # ATX heading: hashes followed by at least one space.
            current_block = Heading(line.lstrip("#").strip(), leading(line, "#"))
            i += 1
        elif line.startswith(">"):
            quoted = ""
            # Quoted lines, plus unquoted continuation lines that are not
            # blank, do not start with "#" and are not horizontal rules.
            while i < len(lines) and (
                lines[i].startswith(">")
                or (
                    lines[i].strip()
                    and not lines[i].startswith("#")
                    and not _is_rule(lines[i])
                )
            ):
                quoted += lines[i].lstrip(">") + "\n"
                i += 1

            current_block = Blockquote(quoted)
        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            language = line.lstrip("`~").strip()

            code = ""
            i += 1  # skip the opening fence
            while i < len(lines) and lines[i].strip() not in ("```", "~~~"):
                code += lines[i] + "\n"
                i += 1

            if i < len(lines):
                i += 1  # prevent a new block from beginning with the closing fence

            current_block = CodeBlock(code, language=language)
        elif (
            i < len(lines) - 1
            and lines[i + 1].strip()
            and (only_chars(lines[i + 1].strip(), "=") or only_chars(lines[i + 1].strip(), "-"))
        ):
            # Setext heading: text underlined with "=" (level 1) or "-" (level 2).
            underline = lines[i + 1].strip()
            current_block = Heading(line.strip(), 1 if underline.startswith("=") else 2)
            i += 2
        else:
            # Paragraph. The current line is consumed unconditionally: it
            # matched no other block type, and leaving it in place would stop
            # the scan from ever advancing (e.g. on "#tag" or "~~~~").
            text = line.strip() + "\n"
            i += 1

            while (i < len(lines)
                   and not lines[i].startswith("#")
                   and not lines[i].startswith(">")
                   and not lines[i].startswith(";")
                   and not lines[i].startswith("* ")
                   and not lines[i].startswith("+ ")
                   and not lines[i].startswith("- ")
                   and not lines[i].startswith("~~~")
                   and not lines[i].startswith("```")
                   and not re.match(r"^\d+\.", lines[i])
                   and lines[i].strip()):
                text += lines[i].strip() + "\n"
                i += 1

            current_block = Paragraph(text)

    tokens.append(current_block)

    return tokens
442
443
444
def _plain_text(content):
    """Flatten parsed content (strings, lists, elements) into its text only."""
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        return "".join(_plain_text(part) for part in content)
    return _plain_text(getattr(content, "content", None))


def make_html(ast):
    """Render a list of parsed tokens into a BeautifulSoup tree.

    Args:
        ast: Iterable of ``str`` fragments and element objects, as produced
            by ``tokenise`` or ``parse_line``.

    Returns:
        A ``BeautifulSoup`` object holding the generated nodes. Strings are
        appended as text. Objects without a ``content`` attribute, and
        elements whose tag name is ``"m-void"``, are skipped.
    """
    soup = beautifulsoup.BeautifulSoup()
    for node in ast:
        # Use bs4 to generate HTML
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        tag_name = node.tag_name
        tag = soup.new_tag(str(tag_name))

        # NOTE(review): destinations are copied into href/src unchanged; if
        # the Markdown comes from untrusted users, schemes such as
        # "javascript:" should presumably be filtered - confirm with callers.
        if tag_name == "a":
            tag["href"] = node.destination
        elif tag_name == "img":
            tag["src"] = node.destination
            # The parsed alt text may contain inline elements, which cannot
            # be passed to str.join; reduce it to its plain text instead.
            tag["alt"] = _plain_text(node.content)
        elif tag_name == "pre":
            tag["data-language"] = node.language

        if node.classes:
            tag["class"] = " ".join(node.classes)

        try:
            if tag_name == "img":
                # An image carries no child markup, only an empty string.
                tag.string = ""
            elif isinstance(node.content, list):
                tag.append(make_html(node.content))
            elif node.content:
                tag.string = node.content
        except AttributeError as exc:
            # Best effort: report the problem and still emit the tag.
            print(exc, file=sys.stderr)

        soup.append(tag)
    return soup
474
475
476
def markdown2html(markdown):
    """Convert Markdown text by tokenising it and rendering the tokens.

    Returns whatever ``make_html`` produces for the tokenised input.
    """
    blocks = tokenise(markdown)
    return make_html(blocks)
478
479